From 7adbb4a3ed4ce05d366222e640fed8ce32598f2a Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Tue, 12 Dec 2023 01:08:51 +0100 Subject: [PATCH 01/13] Put tensile source into src dir and remove tests from source --- Tensile/Configs/build_client.yaml | 28 ------------------ .../bugs/2sum_src_pgr1_smallsum.yaml | 0 {Tensile/Tests => Tests}/bugs/d2lds.yaml | 0 .../bugs/fractional_plus_pbc.yaml | 0 .../Tests => Tests}/bugs/free10_swap.yaml | 0 {Tensile/Tests => Tests}/bugs/hpa_beta.yaml | 0 .../Tests => Tests}/bugs/nosourcetmp.yaml | 0 .../bugs/simple_use_initial_strides_1.yaml | 0 {Tensile/Tests => Tests}/bugs/swizzlec1.yaml | 0 .../bugs/test_glvw4_edge_no_asem.yaml | 0 ...lts[Run_Contraction-src1].contraction.yaml | 0 {Tensile/Tests => Tests}/conftest.py | 0 {Tensile/Tests => Tests}/create_tests.py | 0 .../disabled/classic/test_convolution.yaml | 0 .../convolution/test_conv_act1d_filter1d.yaml | 0 .../test_conv_act1d_filter1d_simple.yaml | 0 .../test_conv_act1d_filter2d_simple.yaml | 0 .../test_conv_act1d_filter3d_simple.yaml | 0 .../test_conv_act1d_filter5d_simple.yaml | 0 .../convolution/test_conv_act2d_filter1d.yaml | 0 .../test_conv_act2d_filter1d_simple.yaml | 0 .../disabled/direct_to_lds/dtl_dgemm.yaml | 0 .../direct_to_lds/dtl_dgemm_lite.yaml | 0 .../direct_to_lds/dtl_tsgr_dgemm.yaml | 0 .../disabled/hgemm_nn_source.yaml | 0 .../disabled/multi_sum/test_.py | 0 .../disabled/starter_packed_case.yaml | 0 .../disabled/stridea0_pack_nt.yaml | 0 .../disabled/strideb0_pack_nn.yaml | 0 .../disabled/test_assertion_selection.yaml | 0 .../disabled/test_create_library.yaml | 0 {Tensile/Tests => Tests}/dot/mixmad-nt.yaml | 0 {Tensile/Tests => Tests}/dot/mixmad.yaml | 0 .../bfloat16/bfloat16_hpa_source_nn.yaml | 0 .../bfloat16/bfloat16_hpa_source_nt.yaml | 0 .../bfloat16/bfloat16_hpa_source_tn.yaml | 0 .../bfloat16/bfloat16_hpa_source_tt.yaml | 0 .../Tests => Tests}/emulation/dgemm_asm.yaml | 0 .../double_complex/double_complex_hip_cn.yaml | 0 
.../b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml | 0 .../b8f8gemm_hybrid_b8f8b8s_gfx940.yaml | 0 .../float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml | 0 .../float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml | 0 .../float8/b8gemm_b8b8s_SR_gfx940.yaml | 0 .../emulation/float8/b8gemm_b8b8s_gfx940.yaml | 0 .../emulation/float8/b8gemm_b8hs_gfx940.yaml | 0 .../emulation/float8/b8gemm_b8ss_gfx940.yaml | 0 .../f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml | 0 .../f8b8gemm_hybrid_f8b8b8s_gfx940.yaml | 0 .../float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml | 0 .../float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml | 0 ...f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml | 0 .../float8/f8gemm_f8f8s_SR_gfx940.yaml | 0 .../emulation/float8/f8gemm_f8f8s_gfx940.yaml | 0 .../emulation/float8/f8gemm_f8hs_gfx940.yaml | 0 .../emulation/float8/f8gemm_f8ss_gfx940.yaml | 0 .../float_complex/float_complex_hip_cc.yaml | 0 .../emulation/hgemm_asm_nn.yaml | 0 .../emulation/hgemm_asm_nt.yaml | 0 .../emulation/hgemm_asm_tn.yaml | 0 .../emulation/hgemm_asm_tt.yaml | 0 .../emulation/hgemm_hpa_asm_nn.yaml | 0 .../emulation/hgemm_hpa_asm_nt.yaml | 0 .../emulation/hgemm_hpa_asm_tn.yaml | 0 .../emulation/hgemm_hpa_asm_tt.yaml | 0 .../emulation/igemm_hpa_hip_nn.yaml | 0 .../emulation/igemm_hpa_hip_nt.yaml | 0 .../emulation/igemm_hpa_hip_tn.yaml | 0 .../emulation/igemm_hpa_hip_tt.yaml | 0 .../Tests => Tests}/emulation/mfma/1LDSB.yaml | 0 .../emulation/mfma/cgemm_asm.yaml | 0 .../emulation/mfma/cgemm_asm_conjugate.yaml | 0 .../Tests => Tests}/emulation/mfma/dgemm.yaml | 0 .../emulation/mfma/hpa_bfloat16_gemm_asm.yaml | 0 .../mfma/hpa_bfloat16_gemm_asm_gfx940.yaml | 0 .../emulation/mfma/hpa_hgemm_asm.yaml | 0 .../mfma/hpa_igemm_i8_asm_gfx940.yaml | 0 .../Tests => Tests}/emulation/mfma/sgemm.yaml | 0 .../extended/big_tensor/biga.yaml | 0 .../extended/big_tensor/bigskinny_nt.yaml | 0 .../extended/big_tensor/largec.yaml | 0 .../rocblas_dgemm_bufferload_limit.yaml | 0 .../rocblas_sgemm_bufferload_limit.yaml | 0 .../extended/classic/test_persistent.yaml | 
0 .../classic/test_tensor_contraction.yaml | 0 .../extended/classic_source/test_dgemm.yaml | 0 .../classic_source/test_hgemm_nn.yaml | 0 .../classic_source/test_hgemm_nt.yaml | 0 .../classic_source/test_hgemm_tn_tt.yaml | 0 .../extended/classic_source/test_sgemm.yaml | 0 .../YamlBuilder/YamlBuilder.py | 0 .../convolution_config/YamlBuilder/header.yml | 0 .../YamlBuilder/solutions/sgemm_1.yml | 0 .../YamlBuilder/solutions/sgemm_src.yml | 0 .../extended/convolution_config/conftest.py | 0 .../test_backwarddata_nchw.py | 0 .../test_backwardweights_nchw.py | 0 .../convolution_config/test_bad_input.py | 0 .../test_conv_vs_contraction.py | 0 .../convolution_config/test_forward_cnhw.py | 0 .../convolution_config/test_forward_nchw.py | 0 .../test_forward_nchw_ckyx.py | 0 .../convolution_config/test_forward_nhwc.py | 0 .../convolution_config/test_forward_pad.py | 0 .../convolution_config/test_simple.py | 0 .../unittests/test_problem_sizes.py | 0 .../unittests/test_string_swap.py | 0 .../custom_kernel/ck_dgemm_90a_nn.yaml | 0 .../ck_dgemm_90a_nn_large_offset.yaml | 0 .../extended/direct_to_lds/dtl_dgemm.yaml | 0 .../extended/direct_to_lds/dtl_hgemm.yaml | 0 .../extended/direct_to_lds/dtl_sgemm.yaml | 0 .../extended/direct_to_lds/dtl_tsgr_f8.yaml | 0 .../direct_to_lds/dtl_tsgr_hgemm.yaml | 0 .../direct_to_lds/dtl_tsgr_sgemm.yaml | 0 .../extended/direct_to_vgpr/dtv_cgemm.yaml | 0 .../extended/direct_to_vgpr/dtv_dgemm.yaml | 0 .../direct_to_vgpr/dtv_dgemm_a1b0.yaml | 0 .../extended/direct_to_vgpr/dtv_f8gemm.yaml | 0 .../extended/direct_to_vgpr/dtv_hgemm.yaml | 0 .../extended/direct_to_vgpr/dtv_igemm.yaml | 0 .../extended/dot2/hgemm_hpa_dot2_nn.yaml | 0 .../extended/dot2/hgemm_hpa_dot2_tn.yaml | 0 .../extended/dot2/hgemm_hpa_dot2_tn_2.yaml | 0 .../extended/double_complex/zgemm_asm.yaml | 0 .../double_complex/zgemm_hip_source_cc.yaml | 0 .../double_complex/zgemm_hip_source_cn.yaml | 0 .../double_complex/zgemm_hip_source_ct.yaml | 0 .../double_complex/zgemm_hip_source_nc.yaml | 0 
.../double_complex/zgemm_hip_source_nn.yaml | 0 .../double_complex/zgemm_hip_source_nt.yaml | 0 .../double_complex/zgemm_hip_source_tc.yaml | 0 .../double_complex/zgemm_hip_source_tn.yaml | 0 .../double_complex/zgemm_hip_source_tt.yaml | 0 .../extended/flat/test_dgemm_asm_flat.yaml | 0 .../extended/flat/test_sgemm_asm_flat.yaml | 0 .../extended/flat/test_sgemm_asm_flat_nt.yaml | 0 .../extended/flat/test_sgemm_asm_flat_tn.yaml | 0 .../extended/flat/test_sgemm_asm_flat_tt.yaml | 0 .../extended/float8/f8gemm-hybrid-ss.yaml | 0 .../extended/float_complex/cgemm_asm.yaml | 0 .../float_complex/cgemm_hip_source_cc.yaml | 0 .../float_complex/cgemm_hip_source_cn.yaml | 0 .../float_complex/cgemm_hip_source_ct.yaml | 0 .../float_complex/cgemm_hip_source_nc.yaml | 0 .../float_complex/cgemm_hip_source_nn.yaml | 0 .../float_complex/cgemm_hip_source_nt.yaml | 0 .../float_complex/cgemm_hip_source_tc.yaml | 0 .../float_complex/cgemm_hip_source_tn.yaml | 0 .../float_complex/cgemm_hip_source_tt.yaml | 0 .../test_dgemm_fractional_tile_sweep.yaml | 0 .../test_hgemm_fractional_tile_sweep.yaml | 0 .../test_sgemm_fractional_edge.yaml | 0 .../test_sgemm_fractional_tile_sweep.yaml | 0 .../extended/global_split_u/hgemm_gsu.yaml | 0 .../global_split_u/hgemm_gsu_minkforgsu.yaml | 0 .../global_split_u/sgemm_gsu_batch.yaml | 0 .../global_split_u/sgemm_gsu_beta0.yaml | 0 .../global_split_u/sgemm_gsu_beta1.yaml | 0 .../global_split_u/sgemm_gsu_beta2.yaml | 0 .../global_split_u/sgemm_gsu_usebeta0.yaml | 0 .../hpa_source/test_hgemm_hpa_src_nn.yaml | 0 .../hpa_source/test_hgemm_hpa_src_nt.yaml | 0 .../hpa_source/test_hgemm_hpa_src_tn.yaml | 0 .../hpa_source/test_hgemm_hpa_src_tt.yaml | 0 .../local_split_u/bfloat16_lsu_mfma.yaml | 0 .../local_split_u/cgemm_lsu_mfma.yaml | 0 .../extended/local_split_u/dgemm_lsu.yaml | 0 .../local_split_u/dgemm_lsu_mfma.yaml | 0 .../local_split_u/f8gemm_lsu_mfma.yaml | 0 .../extended/local_split_u/hgemm_lsu.yaml | 0 .../local_split_u/hgemm_lsu_grvw2.yaml | 0 
.../local_split_u/hgemm_lsu_mfma.yaml | 0 .../local_split_u/hgemm_lsu_mfma_a1b0.yaml | 0 .../local_split_u/igemm_lsu_mfma.yaml | 0 .../extended/local_split_u/sgemm_lsu.yaml | 0 .../local_split_u/sgemm_lsu_mfma.yaml | 0 .../local_split_u/zgemm_lsu_mfma.yaml | 0 .../mirror_dims/mirror_dims_1sum_zp.yaml | 0 .../mirror_dims_2sum_mir_summ.yaml | 0 .../mirror_dims_2sum_mir_summ_zp_other.yaml | 0 .../mirror_dims_2sum_mir_summ_zp_unroll.yaml | 0 .../mirror_dims_2sum_mir_unroll.yaml | 0 .../mirror_dims_2sum_mir_unroll_summ.yaml | 0 .../mirror_dims_2sum_mir_unroll_zp_other.yaml | 0 ...mirror_dims_2sum_mir_unroll_zp_unroll.yaml | 0 .../mirror_dims_3sum_mir_summ1.yaml | 0 .../mirror_dims_3sum_mir_summ1_summ2.yaml | 0 .../mirror_dims_3sum_mir_summ2.yaml | 0 .../mirror_dims_3sum_mir_summ_zp_other.yaml | 0 .../mirror_dims_3sum_mir_unroll.yaml | 0 .../mirror_dims_3sum_mir_unroll_summ1.yaml | 0 .../mirror_dims_3sum_mir_unroll_zp_other.yaml | 0 .../extended/multi_sum/2sum.yaml | 0 .../extended/multi_sum/2sum_gsu.yaml | 0 .../extended/multi_sum/2sum_gsu_simple.yaml | 0 .../extended/multi_sum/2sum_gsu_src.yaml | 0 .../extended/multi_sum/2sum_src.yaml | 0 .../extended/multi_sum/3sum_gsu.yaml | 0 .../multi_sum/simple_sum2_scrambled.yaml | 0 .../multi_sum_psd/1sum_gsu_simple.yaml | 0 .../extended/multi_sum_psd/1sum_simple.yaml | 0 .../extended/multi_sum_psd/2sum.yaml | 0 .../extended/multi_sum_psd/2sum_gsu.yaml | 0 .../multi_sum_psd/2sum_gsu_simple.yaml | 0 .../multi_sum_psd/2sum_gsuremainder.yaml | 0 .../2sum_gsuremainder_simple.yaml | 0 .../extended/multi_sum_psd/2sum_pbd.yaml | 0 .../multi_sum_psd/2sum_scrambled_simple.yaml | 0 .../extended/multi_sum_psd/3sum.yaml | 0 .../extended/multi_sum_psd/3sum_gsu.yaml | 0 .../multi_sum_psd/3sum_gsu_simple.yaml | 0 .../extended/multi_sum_psd/3sum_simple.yaml | 0 .../extended/multi_sum_psd/README | 0 .../hackable_simple_unrollinc1.yaml | 0 .../extended/nonbatched/sgemm_asm_nn.yaml | 0 .../extended/nonbatched/sgemm_asm_nt.yaml | 0 
.../extended/nonbatched/sgemm_asm_tn.yaml | 0 .../extended/nonbatched/sgemm_asm_tt.yaml | 0 .../pack_tensor_dims/multi_free2.yaml | 0 .../pack_tensor_dims/multi_free_batch.yaml | 0 .../pack_tensor_dims/packed_perf_nn.yaml | 0 .../simple_stridea0_pack.yaml | 0 .../simple_strideb0_pack.yaml | 0 .../pack_tensor_dims/strideb0_pack_nt.yaml | 0 .../pack_tensor_dims/strideb0_pack_tn.yaml | 0 .../pack_tensor_dims/vectorstore0.yaml | 0 .../extended/stagger_u/big_skinny_A_NN.yaml | 0 .../extended/stagger_u/big_skinny_A_NT.yaml | 0 .../extended/stagger_u/big_skinny_A_TN.yaml | 0 .../extended/stagger_u/big_skinny_A_TT.yaml | 0 .../extended/stagger_u/big_skinny_B_NN.yaml | 0 .../extended/stagger_u/big_skinny_B_NT.yaml | 0 .../extended/stagger_u/big_skinny_B_TN.yaml | 0 .../extended/stagger_u/big_skinny_B_TT.yaml | 0 .../extended/stream_k/sk_2tile_hgemm_hhs.yaml | 0 .../extended/stream_k/sk_2tile_sgemm.yaml | 0 .../extended/stream_k/sk_hgemm_hhs.yaml | 0 .../extended/stream_k/sk_sgemm.yaml | 0 .../extended/tensor_contraction/README | 0 .../tensor_contraction/allownofree.yaml | 0 .../tensor_contraction/assert_size_equal.yaml | 0 .../tensor_contraction/exact_conv.yaml | 0 .../extended/tensor_contraction/filter.yaml | 0 .../extended/tensor_contraction/ncdhw.yaml | 0 .../tensor_contraction/sweep_packed_dims.yaml | 0 .../extended/tensor_contraction/swizzle0.yaml | 0 .../extended/tensor_contraction/swizzle1.yaml | 0 .../extended/tensor_contraction/swizzle2.yaml | 0 .../extended/tensor_contraction/swizzle3.yaml | 0 ...packed_strides3d_defaults.contraction.yaml | 0 ...w_packed_strides_filter3d.contraction.yaml | 0 .../test_nchw_filter_contraction.yaml | 0 .../tlu0_non_unit_stride.yaml | 0 .../simple_use_initial_strides_1.yaml | 0 .../extended/use_initial_strides/test_1.yaml | 0 .../extended/use_initial_strides/test_2.yaml | 0 .../use_initial_strides/test_strides.yaml | 0 .../use_initial_strides/test_strides1.yaml | 0 .../perf_uis_cd_specialized.yaml | 0 
.../test_use_initial_strides_cd_0.yaml | 0 .../test_use_initial_strides_cd_2.yaml | 0 .../extended/vector_width/hgemm_nn_asm.yaml | 0 .../extended/vector_width/sgemm_nn_asm.yaml | 0 .../vector_width/sgemm_nn_source.yaml | 0 .../zeropad/test_zp_2sum_zpother.yaml | 0 .../extended/zeropad/test_zp_simple_1sum.yaml | 0 .../zeropad/test_zp_simple_2sum_zp_both.yaml | 0 .../zeropad/test_zp_simple_2sum_zp_other.yaml | 0 .../test_zp_simple_2sum_zp_unroll.yaml | 0 .../zeropad/test_zp_simple_3sum_zp_other.yaml | 0 .../hipModuleLoad_timing/Makefile | 0 .../hipModuleLoadTiming.cpp | 0 .../integration/test_integration.py | 0 .../pre_checkin/4xi8gemm_hpa_hip_nn.yaml | 0 .../pre_checkin/4xi8gemm_hpa_hip_nt.yaml | 0 .../pre_checkin/4xi8gemm_hpa_hip_tn.yaml | 0 .../pre_checkin/4xi8gemm_hpa_hip_tt.yaml | 0 .../bfloat16/bfloat16_hpa_source_nn.yaml | 0 .../bfloat16/bfloat16_hpa_source_nt.yaml | 0 .../bfloat16/bfloat16_hpa_source_tn.yaml | 0 .../bfloat16/bfloat16_hpa_source_tt.yaml | 0 .../bfloat16/bfloat16s_hpa_source_nn.yaml | 0 .../bfloat16/bfloat16s_hpa_source_nt.yaml | 0 .../bfloat16/bfloat16s_hpa_source_tn.yaml | 0 .../bfloat16/bfloat16s_hpa_source_tt.yaml | 0 .../Tests => Tests}/pre_checkin/cov/COV4.yaml | 0 .../Tests => Tests}/pre_checkin/cov/COV5.yaml | 0 .../pre_checkin/cov/COVDefault.yaml | 0 .../denorm/bfloat16_hpa_source_nn.yaml | 0 .../pre_checkin/denorm/dgemm_asm.yaml | 0 .../pre_checkin/denorm/hgemm_hpa_asm_nn.yaml | 0 .../denorm/mfma/bfloat16_1k_denorm.yaml | 0 .../denorm/mfma/bfloat16_denorm.yaml | 0 .../pre_checkin/denorm/mfma/dgemm_denorm.yaml | 0 .../pre_checkin/denorm/mfma/hgemm_denorm.yaml | 0 .../denorm/mfma/hgemm_denorm_alt.yaml | 0 .../denorm/mfma/hgemm_denorm_alt_rnz.yaml | 0 .../pre_checkin/denorm/mfma/sgemm_denorm.yaml | 0 .../pre_checkin/denorm/sgemm_asm_nn.yaml | 0 .../pre_checkin/dgemm_asm.yaml | 0 .../pre_checkin/dgemm_general_batch_asm.yaml | 0 .../direct_to_vgpr/dtv_sgemm_lite.yaml | 0 .../double_complex/double_complex_asm_cc.yaml | 0 
.../double_complex/double_complex_asm_cn.yaml | 0 .../double_complex/double_complex_asm_ct.yaml | 0 .../double_complex/double_complex_asm_nc.yaml | 0 .../double_complex/double_complex_asm_nn.yaml | 0 .../double_complex/double_complex_asm_nt.yaml | 0 .../double_complex/double_complex_asm_tc.yaml | 0 .../double_complex/double_complex_asm_tn.yaml | 0 .../double_complex/double_complex_asm_tt.yaml | 0 .../double_complex/double_complex_hip_cc.yaml | 0 .../double_complex/double_complex_hip_cn.yaml | 0 .../double_complex/double_complex_hip_ct.yaml | 0 .../double_complex/double_complex_hip_nc.yaml | 0 .../double_complex/double_complex_hip_nn.yaml | 0 .../double_complex/double_complex_hip_nt.yaml | 0 .../double_complex/double_complex_hip_tc.yaml | 0 .../double_complex/double_complex_hip_tn.yaml | 0 .../double_complex/double_complex_hip_tt.yaml | 0 .../float_complex/float_complex_asm_cc.yaml | 0 .../float_complex/float_complex_asm_cn.yaml | 0 .../float_complex/float_complex_asm_ct.yaml | 0 .../float_complex/float_complex_asm_nc.yaml | 0 .../float_complex/float_complex_asm_nn.yaml | 0 .../float_complex/float_complex_asm_nt.yaml | 0 .../float_complex/float_complex_asm_tc.yaml | 0 .../float_complex/float_complex_asm_tn.yaml | 0 .../float_complex/float_complex_asm_tt.yaml | 0 .../float_complex/float_complex_hip_cc.yaml | 0 .../float_complex/float_complex_hip_cn.yaml | 0 .../float_complex/float_complex_hip_ct.yaml | 0 .../float_complex/float_complex_hip_nc.yaml | 0 .../float_complex/float_complex_hip_nn.yaml | 0 .../float_complex/float_complex_hip_nt.yaml | 0 .../float_complex/float_complex_hip_tc.yaml | 0 .../float_complex/float_complex_hip_tn.yaml | 0 .../float_complex/float_complex_hip_tt.yaml | 0 .../pre_checkin/hgemm_asm_nn.yaml | 0 .../pre_checkin/hgemm_asm_nt.yaml | 0 .../pre_checkin/hgemm_asm_tn.yaml | 0 .../pre_checkin/hgemm_asm_tt.yaml | 0 .../hgemm_general_batch_asm_nn.yaml | 0 .../hgemm_general_batch_hpa_asm_nn.yaml | 0 .../hgemm_hpa_asm_f32_alphabeta_nn.yaml | 0 
.../hgemm_hpa_asm_f32_alphabeta_nt.yaml | 0 .../hgemm_hpa_asm_f32_alphabeta_tn.yaml | 0 .../hgemm_hpa_asm_f32_alphabeta_tt.yaml | 0 .../pre_checkin/hgemm_hpa_asm_nn.yaml | 0 .../pre_checkin/hgemm_hpa_asm_nt.yaml | 0 .../pre_checkin/hgemm_hpa_asm_tn.yaml | 0 .../pre_checkin/hgemm_hpa_asm_tt.yaml | 0 .../pre_checkin/hgemm_hpa_iu2_asm_nn.yaml | 0 .../pre_checkin/hgemm_hpa_iu2_asm_nt.yaml | 0 .../pre_checkin/hgemm_hpa_iu2_asm_tn.yaml | 0 .../pre_checkin/hgemm_hpa_iu2_asm_tt.yaml | 0 .../pre_checkin/hsgemm_hpa_asm_nn.yaml | 0 .../pre_checkin/hsgemm_hpa_asm_nt.yaml | 0 .../pre_checkin/hsgemm_hpa_asm_tn.yaml | 0 .../pre_checkin/hsgemm_hpa_asm_tt.yaml | 0 .../pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml | 0 .../pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml | 0 .../pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml | 0 .../pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml | 0 .../pre_checkin/igemm_hpa_asm_nn.yaml | 0 .../pre_checkin/igemm_hpa_hip_nn.yaml | 0 .../pre_checkin/mfma/1LDSB.yaml | 0 .../pre_checkin/mfma/c-tile-reuse-no-nll.yaml | 0 .../pre_checkin/mfma/cgemm_asm.yaml | 0 .../pre_checkin/mfma/cgemm_asm_conjugate.yaml | 0 .../mfma/dgemm_alpha1_beta0_sgpr.yaml | 0 .../pre_checkin/mfma/dgemm_asm.yaml | 0 .../pre_checkin/mfma/dgemm_gb_global_ldd.yaml | 0 .../pre_checkin/mfma/dgemm_large_offset.yaml | 0 .../mfma/hpa_bfloat16_gemm_asm.yaml | 0 .../mfma/hpa_bfloat16_gemm_asm_gfx940.yaml | 0 .../hpa_bfloat16_general_batch_gemm_asm.yaml | 0 ...float16_general_batch_gemm_asm_gfx940.yaml | 0 .../mfma/hpa_bfloat16s_gemm_asm.yaml | 0 .../mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml | 0 .../pre_checkin/mfma/hpa_hgemm_asm.yaml | 0 .../mfma/hpa_hgemm_f32_alphabeta_asm.yaml | 0 .../mfma/hpa_hgemm_general_batch_asm.yaml | 0 .../pre_checkin/mfma/hpa_hgemm_split_lds.yaml | 0 .../pre_checkin/mfma/hpa_hsgemm_asm.yaml | 0 .../pre_checkin/mfma/hpa_igemm_i8_asm.yaml | 0 .../mfma/hpa_igemm_i8_asm_gfx940.yaml | 0 .../mfma/hpa_igemm_i8_split_lds.yaml | 0 .../mfma/hpa_igemm_i8_split_lds_gfx940.yaml | 0 
.../pre_checkin/mfma/sgemm_64bit_offset.yaml | 0 .../mfma/sgemm_64bit_offset_post.yaml | 0 .../pre_checkin/mfma/sgemm_asm.yaml | 0 .../mfma/sgemm_general_batch_asm.yaml | 0 .../pre_checkin/mfma/sgemm_split_lds.yaml | 0 .../mfma/sgemm_xf32_asm_gfx940.yaml | 0 .../pre_checkin/mfma/wider_local_read.yaml | 0 .../pre_checkin/mfma/zgemm_asm.yaml | 0 .../pre_checkin/mfma/zgemm_asm_conjugate.yaml | 0 .../no_load_loop/nll_reproduce_bug.yaml | 0 .../no_load_loop/sgemm_nll_asm_nn.yaml | 0 .../no_load_loop/sgemm_nll_asm_nt.yaml | 0 .../no_load_loop/sgemm_nll_asm_tn.yaml | 0 .../no_load_loop/sgemm_nll_asm_tt.yaml | 0 .../regression/persistent_kernel.yaml | 0 .../pre_checkin/sgemm_asm_nn.yaml | 0 .../pre_checkin/sgemm_asm_nt.yaml | 0 .../pre_checkin/sgemm_asm_tn.yaml | 0 .../pre_checkin/sgemm_asm_tn_bigk.yaml | 0 .../pre_checkin/sgemm_asm_tt.yaml | 0 .../pre_checkin/sgemm_exact_dict.yaml | 0 .../sgemm_general_batch_asm_nn.yaml | 0 .../source/test_dgemm_defaults.yaml | 0 .../source/test_hgemm_defaults.yaml | 0 .../pre_checkin/source/test_hgemm_hpa.yaml | 0 .../source/test_sgemm_defaults.yaml | 0 .../pre_checkin/wmma/hgemm_wmma.yaml | 0 .../wmma/hpa_bfloat16_gemm_wmma.yaml | 0 .../pre_checkin/wmma/hpa_hgemm_wmma.yaml | 0 .../pre_checkin/wmma/hpa_igemm_wmma.yaml | 0 .../special/global_split_u_src/README | 0 .../special/global_split_u_src/hgemm_gsu.yaml | 0 .../global_split_u_src/sgemm_gsu_beta0.yaml | 0 .../global_split_u_src/sgemm_gsu_beta1.yaml | 0 .../global_split_u_src/sgemm_gsu_beta2.yaml | 0 .../sgemm_gsu_usebeta0.yaml | 0 .../special/igemm/igemm_hpa_hip_lsu.yaml | 0 .../special/igemm/igemm_hpa_hip_nn.yaml | 0 .../special/igemm/igemm_hpa_hip_tt.yaml | 0 .../library_data/hardcodedParameters.yaml | 0 .../initialSolutionParameters.yaml | 0 .../library/Kernels.so-000-gfx1010.hsaco | 0 .../library/Kernels.so-000-gfx1011.hsaco | 0 .../library/Kernels.so-000-gfx803.hsaco | 0 .../library/Kernels.so-000-gfx900.hsaco | 0 .../library/Kernels.so-000-gfx906.hsaco | 0 
.../library/Kernels.so-000-gfx908.hsaco | 0 .../library_data/library/TensileLibrary.yaml | 0 .../library/TensileLibrary_gfx1010.co | 0 .../library/TensileLibrary_gfx1011.co | 0 .../library/TensileLibrary_gfx803.co | 0 .../library/TensileLibrary_gfx900.co | 0 .../library/TensileLibrary_gfx906.co | 0 .../library/TensileLibrary_gfx908.co | 0 .../unit/library_data/library/metadata.yaml | 0 .../unit/library_data/problemType.yaml | 0 .../unit/solutions/solutions_nn_3.yaml | 0 {Tensile/Tests => Tests}/unit/__init__.py | 0 .../unit/customKernels/TestKernel.s | 0 .../unit/replacement/bad_file/bad.txt | 0 .../unit/replacement/duplicate_kernel/a.txt | 0 .../unit/replacement/duplicate_kernel/b.txt | 0 .../replacement/known_kernels_v2/baz.s.txt | 0 .../known_kernels_v2/kernel_named_bar.txt | 0 .../known_kernels_v2/kernel_named_foo.txt | 0 .../replacement/known_kernels_v3/baz.s.txt | 0 .../known_kernels_v3/kernel_named_bar.txt | 0 .../known_kernels_v3/kernel_named_foo.txt | 0 {Tensile/Tests => Tests}/unit/test_Common.py | 0 .../Tests => Tests}/unit/test_Component.py | 0 .../unit/test_Configuration.py | 0 .../unit/test_CustomKernels.py | 0 .../Tests => Tests}/unit/test_DataType.py | 0 .../unit/test_HardwarePredicates.py | 0 .../unit/test_KernelWriterAssembly.py | 0 .../Tests => Tests}/unit/test_LibraryIO.py | 0 .../unit/test_PerfMetricPredicates.py | 0 .../Tests => Tests}/unit/test_Priority.py | 0 .../unit/test_ReplacementKernels.py | 0 .../unit/test_TensileCreateLibrary.py | 0 .../Tests => Tests}/unit/test_conv_problem.py | 0 .../unit/test_exact_problem.py | 0 .../Tests => Tests}/unit/test_makeProblem.py | 0 .../Tests => Tests}/unit/test_mergeLogic.py | 0 .../Tests => Tests}/unit/test_tryAssembler.py | 0 .../unit/test_useGlobalParameters.py | 0 .../vega_20/fast/igemm_asm_nn.yaml | 0 .../vega_20/fast/igemm_asm_nt.yaml | 0 .../vega_20/fast/igemm_asm_tn.yaml | 0 .../vega_20/fast/igemm_asm_tt.yaml | 0 .../global_split_u/igemm_gsu_beta0.yaml | 0 
.../global_split_u/igemm_gsu_beta1.yaml | 0 .../global_split_u/igemm_gsu_beta2.yaml | 0 .../nightly/local_split_u/igemm_lsu.yaml | 0 .../Tests => Tests}/weekly/assertions/README | 0 .../assertions/test_hgemm_asem2_asm.yaml | 0 .../classic_source/test_hgemm_vectors.yaml | 0 .../classic_source/test_sgemm_vectors.yaml | 0 .../Tests => Tests}/yaml_only/test_config.py | 0 {Tensile/Tests => Tests}/yaml_only/test_ya | 0 .../Tensile}/AsmMemoryInstruction.py | 0 {Tensile => src/Tensile}/AsmRegisterPool.py | 0 {Tensile => src/Tensile}/AsmUtils.py | 0 {Tensile => src/Tensile}/BenchmarkProblems.py | 0 {Tensile => src/Tensile}/BenchmarkSplitter.py | 0 {Tensile => src/Tensile}/BenchmarkStructs.py | 0 {Tensile => src/Tensile}/ClientExecutable.py | 0 {Tensile => src/Tensile}/ClientWriter.py | 0 {Tensile => src/Tensile}/Code.py | 0 {Tensile => src/Tensile}/Common.py | 0 {Tensile => src/Tensile}/Component.py | 0 .../Tensile}/Components/ComputeStoreVgprs.py | 0 .../Tensile}/Components/LocalRead.py | 0 .../Tensile}/Components/LraTileAssignment.py | 0 .../Tensile}/Components/MAC_BF16_HPA.py | 0 .../Tensile}/Components/MAC_F16.py | 0 .../Tensile}/Components/MAC_F16_HPA.py | 0 .../Tensile}/Components/MAC_F32.py | 0 .../Tensile}/Components/MAC_F32C.py | 0 .../Tensile}/Components/MAC_F64.py | 0 .../Tensile}/Components/MAC_F64C.py | 0 .../Tensile}/Components/MAC_I8X4.py | 0 .../Tensile}/Components/MAC_I8_HPA.py | 0 {Tensile => src/Tensile}/Components/MFMA.py | 0 .../Components/NotLocalFullTileElements.py | 0 .../Tensile}/Components/Priority.py | 0 .../Components/PseudoRandomGenerator.py | 0 .../Components/ShiftVectorComponents.py | 0 .../Tensile}/Components/Signature.py | 0 .../Tensile}/Components/__init__.py | 0 {Tensile => src/Tensile}/Configuration.py | 0 {Tensile => src/Tensile}/Contractions.py | 0 {Tensile => src/Tensile}/CustomKernels.py | 0 ...128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s | 0 {Tensile => src/Tensile}/DataType.py | 0 {Tensile => src/Tensile}/EmbeddedData.py | 0 
.../Tensile}/GenerateSummations.py | 0 {Tensile => src/Tensile}/Hardware.py | 0 {Tensile => src/Tensile}/KernelWriter.py | 0 .../Tensile}/KernelWriterAssembly.py | 0 {Tensile => src/Tensile}/KernelWriterBase.py | 0 .../Tensile}/KernelWriterBetaOnly.py | 0 .../Tensile}/KernelWriterConversion.py | 0 .../Tensile}/KernelWriterSource.py | 0 .../Tensile}/KernelWriterStreamKInit.py | 0 {Tensile => src/Tensile}/LibraryIO.py | 0 {Tensile => src/Tensile}/LibraryLogic.py | 0 {Tensile => src/Tensile}/Parallel.py | 0 {Tensile => src/Tensile}/Properties.py | 0 .../Tensile}/ReplacementKernels.py | 0 {Tensile => src/Tensile}/SolutionLibrary.py | 0 .../Tensile}/SolutionSelectionLibrary.py | 0 {Tensile => src/Tensile}/SolutionStructs.py | 0 {Tensile => src/Tensile}/SolutionWriter.py | 0 {Tensile => src/Tensile}/Tensile.py | 0 .../Tensile}/TensileBenchmarkCluster.py | 0 .../TensileBenchmarkClusterScripts.py | 0 .../Tensile}/TensileBenchmarkLibraryClient.py | 0 .../Tensile}/TensileClientConfig.py | 0 .../Tensile}/TensileCreateLibrary.py | 0 .../Tensile}/TensileLibLogicToYaml.py | 0 .../Tensile}/TensileMergeLibrary.py | 0 .../Tensile}/TensileRetuneLibrary.py | 0 .../Tensile}/TensileUpdateLibrary.py | 0 {Tensile => src/Tensile}/Utils.py | 0 {Tensile => src/Tensile}/__init__.py | 0 {Tensile => src/Tensile}/bin/Tensile | 0 .../Tensile}/bin/TensileBenchmarkCluster | 0 .../Tensile}/bin/TensileClientConfig | 0 .../Tensile}/bin/TensileCreateLibrary | 0 .../Tensile}/bin/TensileGenerateSummations | 0 .../Tensile}/bin/TensileLibLogicToYaml | 0 .../Tensile}/bin/TensileMergeLibrary | 0 .../Tensile}/bin/TensileRetuneLibrary | 0 .../Tensile}/bin/TensileUpdateLibrary | 0 .../alternate-format/sizeList-example.yaml | 0 .../alternate-format/vega20-example.yaml | 0 .../Tensile/data}/Configs/deep_bench_nn.csv | 0 .../data}/Configs/deep_bench_nn_batched.csv | 0 .../Tensile/data}/Configs/deep_bench_nt.csv | 0 .../data}/Configs/deep_bench_nt_batched.csv | 0 .../Tensile/data}/Configs/deep_bench_tn.csv | 0 
.../data}/Configs/deep_bench_tn_batched.csv | 0 .../Configs/mfma/mfma_hpa_bf16_nt_test.yaml | 0 .../Configs/mfma/mfma_igemm_lite_test.yaml | 0 .../Configs/mfma/mfma_igemm_nn_asm_full.yaml | 0 .../Configs/mfma/mfma_igemm_nt_asm_full.yaml | 0 .../Configs/mfma/mfma_igemm_tn_asm_full.yaml | 0 .../Configs/mfma/mfma_igemm_tt_asm_full.yaml | 0 .../Tensile/data}/Configs/mfma/mfma_test.yaml | 0 .../mfma/rocblas_cgemm_asm_xdlops.yaml | 0 .../mfma/rocblas_sgemm_asm_single_kernel.yaml | 0 .../mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml | 0 .../data}/Configs/mfma/sgemm_tlunn.yaml | 0 .../Configs/mfma/sgemm_transposeLDS.yaml | 0 .../vega10_Cijk_Ailk_Bljk_HB.yaml | 0 .../vega10_Cijk_Ailk_Bljk_SB.yaml | 0 .../vega10_Cijk_Ailk_Bjlk_HB.yaml | 0 .../vega10_Cijk_Ailk_Bjlk_SB.yaml | 0 .../vega10_Cijk_Ailk_Bljk_HB.yaml | 0 .../vega10_Cijk_Ailk_Bljk_SB.yaml | 0 .../vega10_Cijk_Alik_Bljk_HB.yaml | 0 .../vega10_Cijk_Alik_Bljk_SB.yaml | 0 .../Tensile/data}/Configs/miopen/Makefile | 0 .../Tensile/data}/Configs/miopen/README.md | 0 .../configs/vega20_sgemm_nn_bert.yaml | 0 .../configs/vega20_sgemm_nt_bert.yaml | 0 .../configs/vega20_sgemm_tn_bert.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_bert.yaml | 0 .../configs/vega20_sgemm_nt_bert.yaml | 0 .../configs/vega20_sgemm_tn_bert.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/arcturus_sgemm_nn_bert.yaml | 0 .../configs/arcturus_sgemm_nt_bert.yaml | 0 .../configs/arcturus_sgemm_tn_bert.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_msra.yaml | 0 .../configs/vega20_sgemm_nt_msra.yaml | 0 .../configs/vega20_sgemm_tn_msra.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 
.../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_bert.yaml | 0 .../configs/vega20_sgemm_nt_bert.yaml | 0 .../configs/vega20_sgemm_tn_bert.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_hgemm_nn_bert_f16.yaml | 0 .../configs/vega20_hgemm_nt_bert_f16.yaml | 0 .../configs/vega20_hgemm_tn_bert_f16.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_HB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_HB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_HB.yaml | 0 .../configs/bert_sgemm_xdlops_nn.yaml | 0 .../configs/bert_sgemm_xdlops_tn.yaml | 0 .../2020-05-18/configs/dlrm_sgemm_xdlops.yaml | 0 .../configs/dlrm_sgemm_xdlops_nt.yaml | 0 .../replacement-kernel-arcturus-tn.yaml | 0 .../rocblas_sgemm_nn_inc1_asm_full.yaml | 0 .../rocblas_sgemm_nt_inc1_asm_full.yaml | 0 .../rocblas_sgemm_tn_inc1_asm_full.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_batched_msra.yaml | 0 .../configs/vega20_sgemm_nt_batched_msra.yaml | 0 .../configs/vega20_sgemm_tn_batched_msra.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_onnx.yaml | 0 .../configs/vega20_sgemm_nt_onnx.yaml | 0 .../configs/vega20_sgemm_tn_onnx.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_hgemm_nn_megatron.yaml | 0 .../configs/vega20_hgemm_nt_megatron.yaml | 0 .../configs/vega20_hgemm_tn_megatron.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_HBH.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_HBH.yaml | 0 .../archives/bert/2020-11-06/configs/doit.sh | 0 .../archives/bert/2020-11-06/configs/nn.yaml | 0 .../archives/bert/2020-11-06/configs/nt.yaml | 0 
.../archives/bert/2020-11-06/configs/tn.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../bert/2020-11-08/configs/bert-nn.yaml | 0 .../bert/2020-11-08/configs/bert-nt.yaml | 0 .../bert/2020-11-08/configs/bert-tn.yaml | 0 .../archives/bert/2020-11-08/configs/doit.sh | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_dlrm.yaml | 0 .../configs/vega20_sgemm_nt_dlrm.yaml | 0 .../configs/vega20_sgemm_tn_dlrm.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/arcturus_sgemm_nn_dlrm.yaml | 0 .../configs/arcturus_sgemm_nt_dlrm.yaml | 0 .../configs/arcturus_sgemm_tn_dlrm.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../dlrm/2020-07-02/configs/temp.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../configs/sgemm_xdlops_nn_terabyte.yaml | 0 .../configs/sgemm_xdlops_nt_terabyte.yaml | 0 .../configs/sgemm_xdlops_tn_terabyte.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 ...urus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml | 0 ...urus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml | 0 ...urus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../miopen/archives/ext2/2020-11-05/README | 0 .../clients/samples/example_gemm_ext2-tn.cpp | 0 .../ext2/2020-11-05/gfx900/configs/doit.sh | 0 .../gfx900/configs/spec2-nn-gfx900.yaml | 0 .../gfx900/configs/spec2-tn-gfx900.yaml | 0 .../gfx900/configs/speccd-nn-gfx900.yaml | 0 
.../gfx900/configs/speccd-tn-gfx900.yaml | 0 .../joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../ext2/2020-11-05/gfx906/configs/doit.sh | 0 .../gfx906/configs/spec2-nn-gfx906.yaml | 0 .../gfx906/configs/spec2-tn-gfx906.yaml | 0 .../gfx906/configs/speccd-nn-gfx906.yaml | 0 .../gfx906/configs/speccd-tn-gfx906.yaml | 0 .../joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../ext2/2020-11-05/gfx908/configs/doit.sh | 0 .../gfx908/configs/spec2-nn-gfx908.yaml | 0 .../gfx908/configs/spec2-tn-gfx908.yaml | 0 .../gfx908/configs/speccd-nn-gfx908.yaml | 0 .../gfx908/configs/speccd-tn-gfx908.yaml | 0 .../joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml | 0 .../raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml | 0 .../configs/sgemm_inception_nn.yaml | 0 .../configs/sgemm_inception_nt_batched.yaml | 0 .../configs/sgemm_inception_tn.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_riga.yaml | 0 .../configs/vega20_sgemm_nt_riga.yaml | 0 .../configs/vega20_sgemm_tn_riga.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../arcturus_sgemm_nn_resnext-inception.yaml | 0 .../arcturus_sgemm_nt_resnext-inception.yaml | 0 
.../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../2021-02-04/2_BenchmarkData.tar.gz | Bin .../configs/vega20_hgemm_nn_hbh.yaml | 0 .../configs/vega20_hgemm_nt_hbh.yaml | 0 .../configs/vega20_hgemm_tn_hbh.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_HBH.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_HBH.yaml | 0 .../configs/vega20_sgemm_nn_mlp.yaml | 0 .../configs/vega20_sgemm_nt_mlp.yaml | 0 .../configs/vega20_sgemm_tn_mlp.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_k1.yaml | 0 .../configs/vega20_sgemm_nt_k1.yaml | 0 .../configs/vega20_sgemm_tn_k1.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../archive/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../archive/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../archive/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../archive/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs1/vega20_sgemm_nn_phantom.yaml | 0 .../configs1/vega20_sgemm_tn_phantom.yaml | 0 .../configs2/vega20_sgemm_nn_phantom.yaml | 0 .../configs2/vega20_sgemm_nt_phantom.yaml | 0 .../configs2/vega20_sgemm_tn_phantom.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_riga.yaml | 0 .../configs/vega20_sgemm_nt_riga.yaml | 0 .../configs/vega20_sgemm_tn_riga.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 
.../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/resnet-inception-nn-2x2.yaml | 0 .../configs/resnet-inception-nn.yaml | 0 .../configs/resnet-inception-nt-2x2.yaml | 0 .../configs/resnet-inception-nt.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_S.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_S.yaml | 0 .../exact/vega20_Cijkl_Aijml_Bkml_SI.yaml | 0 .../exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml | 0 .../configs/resnet-inception-hgemm-nn.yaml | 0 .../configs/resnet-inception-hgemm-nt.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_HH.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_HH.yaml | 0 .../arcturus_sgemm_nn_resnext-inception.yaml | 0 .../arcturus_sgemm_nt_resnext-inception.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../archives/resnet50/2018-09-12/README.md | 0 .../2018-09-12/config/hgemm_resnet50_nn.yaml | 0 .../2018-09-12/config/hgemm_resnet50_nt.yaml | 0 .../2018-09-12/config/hgemm_resnet50_tn.yaml | 0 .../2018-09-12/config/sgemm_resnet50_nn.yaml | 0 .../2018-09-12/config/sgemm_resnet50_nt.yaml | 0 .../2018-09-12/config/sgemm_resnet50_tn.yaml | 0 .../logic/vega20_Cijk_Ailk_Bjlk_HB.yaml | 0 .../logic/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../logic/vega20_Cijk_Ailk_Bljk_HB.yaml | 0 .../logic/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../logic/vega20_Cijk_Alik_Bljk_HB.yaml | 0 .../logic/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../archives/resnet50/2018-10-09/README.md | 0 .../2018-10-09/config/hgemm_resnet50_nn.yaml | 0 .../2018-10-09/config/hgemm_resnet50_nt.yaml | 0 .../2018-10-09/config/hgemm_resnet50_tn.yaml | 0 .../2018-10-09/config/hpa_resnet50_nn.yaml | 0 .../2018-10-09/config/hpa_resnet50_nt.yaml | 0 .../2018-10-09/config/hpa_resnet50_tn.yaml | 0 .../2018-10-09/config/sgemm_resnet50_nn.yaml | 0 .../2018-10-09/config/sgemm_resnet50_nt.yaml | 0 .../2018-10-09/config/sgemm_resnet50_tn.yaml | 0 .../logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml | 0 .../logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml | 0 
.../logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml | 0 .../logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml | 0 .../logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../logic/main/vega20_Cijk_Alik_Bljk_HB.yaml | 0 .../logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml | 0 .../logic/main/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../merged/vega20_Cijk_Ailk_Bjlk_HB.yaml | 0 .../merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml | 0 .../merged/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../merged/vega20_Cijk_Ailk_Bljk_HB.yaml | 0 .../merged/vega20_Cijk_Ailk_Bljk_HBH.yaml | 0 .../merged/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../merged/vega20_Cijk_Alik_Bljk_HB.yaml | 0 .../merged/vega20_Cijk_Alik_Bljk_HBH.yaml | 0 .../merged/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml | 0 .../resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml | 0 .../resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml | 0 .../resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml | 0 .../resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../resnet50/vega20_Cijk_Alik_Bljk_HB.yaml | 0 .../resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml | 0 .../resnet50/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_resnet50.yaml | 0 .../configs/vega20_sgemm_nt_resnet50.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../2021-02-10/2_BenchmarkData.tar.gz | Bin .../configs/arcturus_sgemm_nn_sb.yaml | 0 .../configs/arcturus_sgemm_nt_sb.yaml | 0 .../configs/arcturus_sgemm_tn_sb.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../2021-02-17/2_BenchmarkData.tar.gz | Bin .../configs/vega20_sgemm_nn_resnext3d.yaml | 0 .../configs/vega20_sgemm_nt_resnext3d.yaml | 0 .../configs/vega20_sgemm_tn_resnext3d.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 
.../2021-02-18/2_BenchmarkData.tar.gz | Bin .../configs/vega20_sgemm_nn_resnext3d-r2.yaml | 0 .../configs/vega20_sgemm_nt_resnext3d-r2.yaml | 0 .../configs/vega20_sgemm_tn_resnext3d-r2.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../replacement-kernel-arcturus-tn.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../base/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../combined/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../configuration/sgemm_tn-guard-pr195.yaml | 0 .../inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../inc/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../archives/rk/2020-08-12/logs/convert.log | 0 .../archives/rk/2020-08-12/logs/merge.log | 0 .../2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_shakespeare.yaml | 0 .../configs/vega20_sgemm_nt_shakespeare.yaml | 0 .../configs/vega20_sgemm_tn_shakespeare.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega10_sgemm_nn_shakespeare.yaml | 0 .../configs/vega10_sgemm_nt_shakespeare.yaml | 0 .../configs/vega10_sgemm_tn_shakespeare.yaml | 0 .../exact/vega10_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega10_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega10_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/arcturus_sgemm_tn_miopen.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../arcturus_dgemm_nn_skinny_small.yaml | 0 .../arcturus_dgemm_nt_skinny_small.yaml | 0 .../vegoa20_dgemm_nn_skinny_small.yaml | 0 .../vegoa20_dgemm_nt_skinny_small.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_DB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_DB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_DB.yaml | 0 .../arcturus_dgemm_nn_skinny_large.yaml | 0 .../configs/vega20_dgemm_nn_skinny_large.yaml | 0 
.../exact/arcturus_Cijk_Ailk_Bljk_DB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_DB.yaml | 0 .../2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../configs/sgemm_sparseNN_gemm_nn.yaml | 0 .../configs/sgemm_sparseNN_gemm_tn.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega10_sgemm_nn_transformer.yaml | 0 .../configs/vega10_sgemm_nt_transformer.yaml | 0 .../configs/vega10_sgemm_tn_transformer.yaml | 0 .../exact/vega10_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega10_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega10_Cijk_Alik_Bljk_SB.yaml | 0 .../configs/vega20_sgemm_nn_transformer.yaml | 0 .../configs/vega20_sgemm_nt_transformer.yaml | 0 .../configs/vega20_sgemm_tn_transformer.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../arcturus_sgemm_nn_transformer.yaml | 0 .../arcturus_sgemm_nt_transformer.yaml | 0 .../arcturus_sgemm_tn_transformer.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../arcturus_sgemm_nn_transformer.yaml | 0 .../arcturus_sgemm_nt_transformer.yaml | 0 .../arcturus_sgemm_tn_transformer.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml | 0 .../vega20_sgemm_nn_sgemm_transformer.yaml | 0 .../vega20_sgemm_nt_sgemm_transformer.yaml | 0 .../vega20_sgemm_tn_sgemm_transformer.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../vega20_hgemm_nn_hgemm_transformer.yaml | 0 .../vega20_hgemm_nt_hgemm_transformer.yaml | 0 .../vega20_hgemm_tn_hgemm_transformer.yaml | 0 
.../exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml | 0 .../exact/vega20_Cijk_Ailk_Bljk_HBH.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_HBH.yaml | 0 .../configs/vega20_sgemm_nt_winograd.yaml | 0 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml | 0 .../configs/vega20_sgemm_tn_winograd.yaml | 0 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml | 0 .../data}/Configs/miopen/boiler/header.yml | 0 .../miopen/boiler/library_logic_hip_only.yml | 0 .../boiler/library_logic_vega10_only.yml | 0 .../boiler/library_logic_vega20_only.yml | 0 .../data}/Configs/miopen/convert_cfg.py | 0 .../Tensile/data}/Configs/miopen/make_all.sh | 0 .../problems/nn/deepbench_conv_1x1_batch1.yml | 0 .../problems/nn/deepbench_conv_1x1_batchN.yml | 0 .../problems/nn/deepbench_gemm_large.yml | 0 .../problems/nn/deepbench_gemm_skinny.yml | 0 .../miopen/problems/nn/resnet50_all.yml | 0 .../miopen/problems/nn/resnet50_batch64.yml | 0 .../miopen/problems/nn/resnet_batch64_B.yml | 0 .../problems/nt/deepbench_gemm_large.yml | 0 .../problems/nt/deepbench_gemm_skinny.yml | 0 .../miopen/problems/nt/resnet50_all.yml | 0 .../problems/tn/deepbench_gemm_large.yml | 0 .../problems/tn/deepbench_gemm_skinny.yml | 0 .../miopen/problems/tn/resnet50_all.yml | 0 .../solutions/hgemm_large_explore_3.yml | 0 .../solutions/hgemm_large_explore_5.yml | 0 .../Configs/miopen/solutions/hgemm_quick.yml | 0 .../solutions/hgemm_skinny_explore_3.yml | 0 .../solutions/hgemm_skinny_explore_5.yml | 0 .../solutions/sgemm_large_explore_3.yml | 0 .../solutions/sgemm_large_explore_5.yml | 0 .../solutions/sgemm_large_explore_7.yml | 0 .../Configs/miopen/solutions/sgemm_quick.yml | 0 .../solutions/sgemm_skinny_explore_3.yml | 0 .../solutions/sgemm_skinny_explore_4.yml | 0 .../solutions/sgemm_skinny_explore_5.yml | 0 .../solutions/sgemm_skinny_explore_7.yml | 0 .../data}/Configs/miopen/types/hgemm_nn.yml | 0 .../data}/Configs/miopen/types/hgemm_nt.yml | 0 .../data}/Configs/miopen/types/hgemm_tn.yml | 0 .../data}/Configs/miopen/types/hgemm_tt.yml | 0 
.../data}/Configs/miopen/types/igemm_nn.yml | 0 .../data}/Configs/miopen/types/igemm_nt.yml | 0 .../data}/Configs/miopen/types/igemm_tn.yml | 0 .../data}/Configs/miopen/types/igemm_tt.yml | 0 .../data}/Configs/miopen/types/sgemm_nn.yml | 0 .../data}/Configs/miopen/types/sgemm_nt.yml | 0 .../data}/Configs/miopen/types/sgemm_tn.yml | 0 .../data}/Configs/miopen/types/sgemm_tt.yml | 0 .../navi21/rocblas_hgemm_gb_nn_asm_full.yaml | 0 .../navi21/rocblas_hgemm_gb_nt_asm_full.yaml | 0 .../navi21/rocblas_hgemm_gb_tn_asm_full.yaml | 0 .../navi21/rocblas_hgemm_gb_tt_asm_full.yaml | 0 .../navi21/rocblas_hgemm_sb_nn_asm_full.yaml | 0 .../navi21/rocblas_hgemm_sb_nt_asm_full.yaml | 0 .../navi21/rocblas_hgemm_sb_tn_asm_full.yaml | 0 .../navi21/rocblas_hgemm_sb_tt_asm_full.yaml | 0 .../rocblas_hpa_hgemm_gb_nn_asm_full.yaml | 0 .../rocblas_hpa_hgemm_gb_nt_asm_full.yaml | 0 .../rocblas_hpa_hgemm_gb_tn_asm_full.yaml | 0 .../rocblas_hpa_hgemm_gb_tt_asm_full.yaml | 0 .../rocblas_hpa_hgemm_sb_nn_asm_full.yaml | 0 .../rocblas_hpa_hgemm_sb_nt_asm_full.yaml | 0 .../rocblas_hpa_hgemm_sb_tn_asm_full.yaml | 0 .../rocblas_hpa_hgemm_sb_tt_asm_full.yaml | 0 .../navi21/rocblas_sgemm_gb_nn_asm_full.yaml | 0 .../navi21/rocblas_sgemm_gb_nt_asm_full.yaml | 0 .../navi21/rocblas_sgemm_gb_tn_asm_full.yaml | 0 .../navi21/rocblas_sgemm_gb_tt_asm_full.yaml | 0 .../navi21/rocblas_sgemm_sb_nn_asm_full.yaml | 0 .../navi21/rocblas_sgemm_sb_nt_asm_full.yaml | 0 .../navi21/rocblas_sgemm_sb_tn_asm_full.yaml | 0 .../navi21/rocblas_sgemm_sb_tt_asm_full.yaml | 0 .../Tensile/data}/Configs/rocblas_cgemm.yaml | 0 .../data}/Configs/rocblas_cgemm_asm_lite.yaml | 0 .../data}/Configs/rocblas_cgemm_hip_lite.yaml | 0 .../data}/Configs/rocblas_dgemm_asm_lite.yaml | 0 .../rocblas_dgemm_asm_single_kernel.yaml | 0 .../Configs/rocblas_dgemm_asm_square.yaml | 0 .../rocblas_dgemm_bufferload_limit.yaml | 0 .../data}/Configs/rocblas_dgemm_hip_lite.yaml | 0 .../Configs/rocblas_dgemm_nn_asm_full.yaml | 0 
.../rocblas_dgemm_nn_inc0_asm_full.yaml | 0 .../Configs/rocblas_dgemm_nt_asm_full.yaml | 0 .../rocblas_dgemm_nt_inc0_asm_full.yaml | 0 .../rocblas_dgemm_nt_inc1_asm_full.yaml | 0 .../rocblas_dgemm_nt_inc2_asm_full.yaml | 0 .../rocblas_dgemm_nt_inc3_asm_full.yaml | 0 .../rocblas_dgemm_nt_resume_train_exp.yaml | 0 .../Configs/rocblas_dgemm_tn_asm_full.yaml | 0 .../Configs/rocblas_dgemm_tt_asm_full.yaml | 0 .../data}/Configs/rocblas_hgemm_asm_full.yaml | 0 .../data}/Configs/rocblas_hgemm_asm_lite.yaml | 0 .../rocblas_hgemm_asm_single_kernel.yaml | 0 .../rocblas_hgemm_bufferload_limit.yaml | 0 .../data}/Configs/rocblas_hgemm_hip_lite.yaml | 0 .../rocblas_hpa_bf16_gemm_tn_asm_test.yaml | 0 .../rocblas_hpa_bf16s_gemm_tn_asm_test.yaml | 0 .../rocblas_hpa_bfloat16_gemm_inc1_hip.yaml | 0 ...as_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml | 0 ...as_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml | 0 ...as_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml | 0 .../rocblas_hpa_bfloat16_hip_lite.yaml | 0 ...ocblas_hpa_bfloat16_hip_single_kernel.yaml | 0 ...rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml | 0 ...rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml | 0 .../rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml | 0 ...s_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml | 0 ...s_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml | 0 ...s_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml | 0 .../rocblas_hpa_bfloat16s_hip_lite.yaml | 0 ...cblas_hpa_bfloat16s_hip_single_kernel.yaml | 0 ...ocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml | 0 ...ocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml | 0 .../Configs/rocblas_hpa_hgemm_asm_lite.yaml | 0 .../rocblas_hpa_hgemm_asm_single_kernel.yaml | 0 .../Configs/rocblas_hpa_hgemm_hip_lite.yaml | 0 .../Configs/rocblas_hpa_hgemm_inc1_hip.yaml | 0 .../rocblas_hpa_hgemm_nn_asm_full.yaml | 0 .../rocblas_hpa_hgemm_nn_inc1_asm_full.yaml | 0 .../rocblas_hpa_hgemm_nt_asm_full.yaml | 0 .../rocblas_hpa_hgemm_nt_inc1_asm_full.yaml | 0 .../rocblas_hpa_hgemm_tn_asm_full.yaml | 0 .../rocblas_hpa_hgemm_tn_inc1_asm_full.yaml | 0 
.../rocblas_hpa_hgemm_tt_asm_full.yaml | 0 .../Configs/rocblas_hpa_hsgemm_asm_lite.yaml | 0 .../rocblas_hpa_hsgemm_asm_single_kernel.yaml | 0 .../Configs/rocblas_hpa_hsgemm_hip_lite.yaml | 0 .../Configs/rocblas_hpa_hsgemm_inc1_hip.yaml | 0 .../rocblas_hpa_hsgemm_nn_asm_full.yaml | 0 .../rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml | 0 .../rocblas_hpa_hsgemm_nt_asm_full.yaml | 0 .../rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml | 0 .../rocblas_hpa_hsgemm_tn_asm_full.yaml | 0 .../rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml | 0 .../rocblas_hpa_hsgemm_tt_asm_full.yaml | 0 .../Configs/rocblas_hpa_igemm_nn_hip.yaml | 0 .../Configs/rocblas_hpa_igemm_nt_hip.yaml | 0 .../Configs/rocblas_hpa_igemm_tn_hip.yaml | 0 .../Configs/rocblas_hpa_igemm_tt_hip.yaml | 0 .../Configs/rocblas_hsgemm_asm_lite.yaml | 0 .../Configs/rocblas_igemm_asm_full_nn.yaml | 0 .../Configs/rocblas_igemm_asm_full_nt.yaml | 0 .../Configs/rocblas_igemm_asm_full_tn.yaml | 0 .../Configs/rocblas_igemm_asm_full_tt.yaml | 0 .../rocblas_igemm_hip_single_kernel.yaml | 0 .../data}/Configs/rocblas_sgemm_asm_full.yaml | 0 .../data}/Configs/rocblas_sgemm_asm_lite.yaml | 0 .../data}/Configs/rocblas_sgemm_asm_only.yaml | 0 .../rocblas_sgemm_asm_single_kernel.yaml | 0 .../rocblas_sgemm_bufferload_limit.yaml | 0 .../data}/Configs/rocblas_sgemm_example.yaml | 0 .../data}/Configs/rocblas_sgemm_hip_lite.yaml | 0 .../rocblas_sgemm_nn_inc1_asm_full.yaml | 0 .../rocblas_sgemm_nt_inc1_asm_full.yaml | 0 .../rocblas_sgemm_tn_inc1_asm_full.yaml | 0 .../rocblas_sgemm_tn_inc2_asm_full.yaml | 0 .../rocblas_sgemm_tn_inc3_asm_full.yaml | 0 .../Tensile/data}/Configs/rocblas_zgemm.yaml | 0 .../data}/Configs/rocblas_zgemm_asm_lite.yaml | 0 .../Tensile/data}/Perf/BDAS/dgemm_kmeans.yaml | 0 .../Tensile/data}/Perf/BDAS/dgemm_pca.yaml | 0 .../Tensile/data}/Perf/BERT/sgemm_xdlops.yaml | 0 .../Tensile/data}/Perf/DLRM/sgemm_xdlops.yaml | 0 .../data}/Perf/DLRM/sgemm_xdlops_nn.yaml | 0 .../Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml | 0 
.../data}/Perf/DLRM/sgemm_xdlops_nt.yaml | 0 .../Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml | 0 .../Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml | 0 .../data}/Perf/TRANSFORMER/sgemm_xdlops.yaml | 0 .../Perf/TRANSFORMER/sgemm_xdlops_nn.yaml | 0 .../Perf/TRANSFORMER/sgemm_xdlops_nt.yaml | 0 .../Tensile/data}/Perf/conv/README | 0 .../data}/Perf/conv/conv_1x1_af0em.yaml | 0 .../data}/Perf/conv/conv_1x1_oddpbd.yaml | 0 .../data}/Perf/conv/conv_1x1u2_bdww.yaml | 0 .../data}/Perf/conv/conv_1x1u2_fwd.yaml | 0 .../Tensile/data}/Perf/conv/conv_1x7_fwd.yaml | 0 .../Tensile/data}/Perf/conv/conv_7x1_fwd.yaml | 0 .../data}/Perf/conv/conv_7x1_fwd2.yaml | 0 .../data}/Perf/conv/conv_7x1_roundup.yaml | 0 .../data}/Perf/conv/conv_7x7u2_fwd.yaml | 0 .../data}/Perf/conv/conv_bwdd_pbd.yaml | 0 .../Tensile/data}/Perf/conv/conv_fwd.yaml | 0 .../Tensile/data}/Perf/conv_bwdd_ex0.yaml | 0 .../Tensile/data}/Perf/conv_bwdd_ex1.yaml | 0 .../Tensile/data}/Perf/conv_bwdw_big_gsu.yaml | 0 .../data}/Perf/conv_bwdw_small_gsu.yaml | 0 .../Tensile/data}/Perf/conv_fwd_ex0.yaml | 0 .../data}/Perf/dgemm_large_square.yaml | 0 {Tensile => src/Tensile/data}/Perf/hpl.yaml | 0 .../Tensile/data}/Perf/hpl_one.yaml | 0 .../Tensile/data}/Perf/hpl_quick.yaml | 0 .../Tensile/data}/Perf/hpl_quick44k.yaml | 0 .../data}/Perf/inception/conv_1x1u1.yaml | 0 .../Perf/inception/conv_1x1u1_starter.yaml | 0 .../data}/Perf/inception/conv_NxN.yaml | 0 .../data}/Perf/sgemm_large_square_nn.yaml | 0 .../data}/Perf/sgemm_large_square_nt.yaml | 0 .../data}/Perf/sgemm_large_square_tn.yaml | 0 .../data}/Perf/use_initial_strides_cd/README | 0 .../perf_baseline0.yaml | 0 .../use_initial_strides_cd/perf_uis_cd0.yaml | 0 .../perf_uis_cd_specialized.yaml | 0 .../Tensile/data}/Source/CMakeLists.txt | 0 .../Tensile/data}/Source/EnableWarnings.cmake | 0 .../Tensile/data}/Source/FindHIP.cmake | 0 .../Tensile/data}/Source/FindOpenCL.cmake | 0 .../Tensile/data}/Source/KernelHeader.h | 0 .../Tensile/data}/Source/TensileTypes.h | 0 
.../data}/Source/client/CMakeLists.txt | 0 .../Source/client/include/BenchmarkTimer.hpp | 0 .../Source/client/include/CSVStackFile.hpp | 0 .../client/include/ClientProblemFactory.hpp | 0 .../client/include/ConvolutionProblem.hpp | 0 .../client/include/DataInitialization.hpp | 0 .../include/DataInitializationTyped.hpp | 0 .../Source/client/include/HardwareMonitor.hpp | 0 .../include/HardwareMonitorListener.hpp | 0 .../client/include/HardwareMonitorType.hpp | 0 .../client/include/HardwareMonitorWindows.hpp | 0 .../client/include/HardwareMonitor_fwd.hpp | 0 .../client/include/LibraryUpdateReporter.hpp | 0 .../Source/client/include/LogReporter.hpp | 0 .../client/include/MetaResultReporter.hpp | 0 .../Source/client/include/MetaRunListener.hpp | 0 .../client/include/PerformanceReporter.hpp | 0 .../client/include/ProgressListener.hpp | 0 .../data}/Source/client/include/Reference.hpp | 0 .../client/include/ReferenceValidator.hpp | 0 .../client/include/ResultComparison.hpp | 0 .../client/include/ResultFileReporter.hpp | 0 .../Source/client/include/ResultReporter.hpp | 0 .../client/include/ResultReporter_fwd.hpp | 0 .../Source/client/include/RunListener.hpp | 0 .../client/include/SolutionIterator.hpp | 0 .../Source/client/include/TimingEvents.hpp | 0 .../Tensile/data}/Source/client/main.cpp | 0 .../Source/client/source/BenchmarkTimer.cpp | 0 .../Source/client/source/CSVStackFile.cpp | 0 .../client/source/ClientProblemFactory.cpp | 0 .../client/source/ConvolutionProblem.cpp | 0 .../client/source/DataInitialization.cpp | 0 .../Source/client/source/HardwareMonitor.cpp | 0 .../client/source/HardwareMonitorListener.cpp | 0 .../client/source/LibraryUpdateReporter.cpp | 0 .../Source/client/source/MetaRunListener.cpp | 0 .../client/source/PerformanceReporter.cpp | 0 .../Source/client/source/ProgressListener.cpp | 0 .../data}/Source/client/source/Reference.cpp | 0 .../client/source/ReferenceValidator.cpp | 0 .../client/source/ResultFileReporter.cpp | 0 
.../Source/client/source/ResultReporter.cpp | 0 .../Source/client/source/SolutionIterator.cpp | 0 .../Source/client/source/TimingEvents.cpp | 0 .../data}/Source/cmake/FindROCmSMI.cmake | 0 .../Tensile/data}/Source/hip_f8_impl.h | 0 .../Tensile/data}/Source/lib/CMakeLists.txt | 0 .../SolutionLibraries/KernelsLiteNavi.yaml | 0 .../navi10_Cijk_Ailk_Bjlk_SB.yaml | 0 .../navi10_Cijk_Ailk_Bljk_SB.yaml | 0 .../navi10_Cijk_Alik_Bjlk_SB.yaml | 0 .../navi10_Cijk_Alik_Bljk_SB.yaml | 0 .../Source/lib/include/Tensile/AMDGPU.hpp | 0 .../lib/include/Tensile/AMDGPUPredicates.hpp | 0 .../lib/include/Tensile/AMDGPU_Detail.hpp | 0 .../include/Tensile/ArithmeticUnitTypes.hpp | 0 .../lib/include/Tensile/CachingLibrary.hpp | 0 .../Source/lib/include/Tensile/Comparison.hpp | 0 .../include/Tensile/ContractionLibrary.hpp | 0 .../include/Tensile/ContractionProblem.hpp | 0 .../Tensile/ContractionProblemPredicates.hpp | 0 .../Tensile/ContractionProblemProperties.hpp | 0 .../Tensile/ContractionProblem_Detail.hpp | 0 .../Tensile/ContractionProblem_fwd.hpp | 0 .../include/Tensile/ContractionSolution.hpp | 0 .../Tensile/ContractionSolution_fwd.hpp | 0 .../lib/include/Tensile/Contractions.hpp | 0 .../Source/lib/include/Tensile/DataTypes.hpp | 0 .../include/Tensile/DataTypes_BFloat16.hpp | 0 .../Tensile/DataTypes_Float8_BFloat8.hpp | 0 .../lib/include/Tensile/DataTypes_Half.hpp | 0 .../lib/include/Tensile/DataTypes_Int8.hpp | 0 .../lib/include/Tensile/DataTypes_Int8x4.hpp | 0 .../include/Tensile/DataTypes_XFloat32.hpp | 0 .../Source/lib/include/Tensile/Debug.hpp | 0 .../lib/include/Tensile/DecisionTree.hpp | 0 .../include/Tensile/DecisionTreeLibrary.hpp | 0 .../Source/lib/include/Tensile/Distance.hpp | 0 .../lib/include/Tensile/DistinctType.hpp | 0 .../lib/include/Tensile/EmbeddedData.hpp | 0 .../lib/include/Tensile/EmbeddedLibrary.hpp | 0 .../lib/include/Tensile/ExactLogicLibrary.hpp | 0 .../Tensile/GranularitySelectionLibrary.hpp | 0 .../lib/include/Tensile/KernelArguments.hpp | 0 
.../include/Tensile/KernelLanguageTypes.hpp | 0 .../Source/lib/include/Tensile/MLFeatures.hpp | 0 .../Source/lib/include/Tensile/Macros.hpp | 0 .../Source/lib/include/Tensile/MapLibrary.hpp | 0 .../include/Tensile/MasterSolutionLibrary.hpp | 0 .../lib/include/Tensile/MatchingLibrary.hpp | 0 .../Tensile/PerformanceMetricTypes.hpp | 0 .../include/Tensile/PlaceholderLibrary.hpp | 0 .../Source/lib/include/Tensile/Predicates.hpp | 0 .../Source/lib/include/Tensile/ProblemKey.hpp | 0 .../Source/lib/include/Tensile/Properties.hpp | 0 .../lib/include/Tensile/PropertyMatching.hpp | 0 .../lib/include/Tensile/ScalarValueTypes.hpp | 0 .../lib/include/Tensile/Serialization.hpp | 0 .../include/Tensile/Serialization/Base.hpp | 0 .../Tensile/Serialization/Containers.hpp | 0 .../Serialization/ContractionPredicates.hpp | 0 .../Serialization/ContractionSolution.hpp | 0 .../Serialization/DecisionTreeLibrary.hpp | 0 .../Serialization/ExactLogicLibrary.hpp | 0 .../GranularitySelectionLibrary.hpp | 0 .../Tensile/Serialization/HasTraits.hpp | 0 .../Tensile/Serialization/MLFeatures.hpp | 0 .../Tensile/Serialization/MapLibrary.hpp | 0 .../Tensile/Serialization/MatchingLibrary.hpp | 0 .../Serialization/PlaceholderLibrary.hpp | 0 .../Tensile/Serialization/Predicates.hpp | 0 .../Tensile/Serialization/Properties.hpp | 0 .../Tensile/Serialization/SolutionLibrary.hpp | 0 .../include/Tensile/SingleSolutionLibrary.hpp | 0 .../Source/lib/include/Tensile/Singleton.hpp | 0 .../lib/include/Tensile/SolutionLibrary.hpp | 0 .../include/Tensile/SolutionLibrary_fwd.hpp | 0 .../include/Tensile/SolutionMapLibrary.hpp | 0 .../Source/lib/include/Tensile/Tensile.hpp | 0 .../lib/include/Tensile/Tensile_fwd.hpp | 0 .../lib/include/Tensile/TensorDescriptor.hpp | 0 .../Tensile/TensorDescriptor_Detail.hpp | 0 .../include/Tensile/TensorDescriptor_fwd.hpp | 0 .../Source/lib/include/Tensile/TensorOps.hpp | 0 .../lib/include/Tensile/TensorOps_fwd.hpp | 0 .../Tensile/UserDrivenTuningParser.hpp | 0 
.../Source/lib/include/Tensile/Utils.hpp | 0 .../data}/Source/lib/include/Tensile/geom.hpp | 0 .../lib/include/Tensile/hip/HipHardware.hpp | 0 .../Tensile/hip/HipSolutionAdapter.hpp | 0 .../lib/include/Tensile/hip/HipUtils.hpp | 0 .../Source/lib/include/Tensile/hip_f8_impl.h | 0 .../lib/include/Tensile/llvm/Loading.hpp | 0 .../Source/lib/include/Tensile/llvm/YAML.hpp | 0 .../lib/include/Tensile/msgpack/Loading.hpp | 0 .../include/Tensile/msgpack/MessagePack.hpp | 0 .../Source/lib/include/Tensile/ocl/OclFwd.hpp | 0 .../lib/include/Tensile/ocl/OclHardware.hpp | 0 .../Tensile/ocl/OclSolutionAdapter.hpp | 0 .../lib/include/Tensile/ocl/OclUtils.hpp | 0 .../data}/Source/lib/source/AMDGPU.cpp | 0 .../Source/lib/source/ArithmeticUnitTypes.cpp | 0 .../Source/lib/source/ContractionProblem.cpp | 0 .../Source/lib/source/ContractionSolution.cpp | 0 .../data}/Source/lib/source/DataTypes.cpp | 0 .../Tensile/data}/Source/lib/source/Debug.cpp | 0 .../data}/Source/lib/source/EmbeddedData.cpp | 0 .../Source/lib/source/EmbeddedLibrary.cpp | 0 .../Source/lib/source/KernelArguments.cpp | 0 .../Source/lib/source/KernelLanguageTypes.cpp | 0 .../data}/Source/lib/source/MLFeatures.cpp | 0 .../lib/source/PerformanceMetricTypes.cpp | 0 .../Source/lib/source/ScalarValueTypes.cpp | 0 .../data}/Source/lib/source/Tensile.cpp | 0 .../Source/lib/source/TensorDescriptor.cpp | 0 .../data}/Source/lib/source/TensorOps.cpp | 0 .../lib/source/UserDrivenTuningParser.cpp | 0 .../Tensile/data}/Source/lib/source/Utils.cpp | 0 .../Source/lib/source/hip/CMakeLists.txt | 0 .../Source/lib/source/hip/HipHardware.cpp | 0 .../lib/source/hip/HipSolutionAdapter.cpp | 0 .../data}/Source/lib/source/llvm/Loading.cpp | 0 .../data}/Source/lib/source/llvm/YAML.cpp | 0 .../Source/lib/source/msgpack/MessagePack.cpp | 0 .../Source/lib/source/ocl/CMakeLists.txt | 0 .../Source/lib/source/ocl/OclHardware.cpp | 0 .../lib/source/ocl/OclSolutionAdapter.cpp | 0 .../data}/Source/lib/source/ocl/OclUtils.cpp | 0 
.../Tensile/data}/Source/multigpu.sh | 0 .../Tensile/data}/Source/tensile_bfloat16.h | 0 .../data}/Source/tensile_float8_bfloat8.h | 0 .../Tensile/data}/Source/winners.awk | 0 .../archive/merge_rocblas_yaml_files.py | 0 .../Tensile/data}/Utilities/merge.py | 0 .../Tensile/data}/cmake/TensileConfig.cmake | 0 .../data}/cmake/TensileConfigVersion.cmake | 0 1353 files changed, 28 deletions(-) delete mode 100644 Tensile/Configs/build_client.yaml rename {Tensile/Tests => Tests}/bugs/2sum_src_pgr1_smallsum.yaml (100%) rename {Tensile/Tests => Tests}/bugs/d2lds.yaml (100%) rename {Tensile/Tests => Tests}/bugs/fractional_plus_pbc.yaml (100%) rename {Tensile/Tests => Tests}/bugs/free10_swap.yaml (100%) rename {Tensile/Tests => Tests}/bugs/hpa_beta.yaml (100%) rename {Tensile/Tests => Tests}/bugs/nosourcetmp.yaml (100%) rename {Tensile/Tests => Tests}/bugs/simple_use_initial_strides_1.yaml (100%) rename {Tensile/Tests => Tests}/bugs/swizzlec1.yaml (100%) rename {Tensile/Tests => Tests}/bugs/test_glvw4_edge_no_asem.yaml (100%) rename {Tensile/Tests => Tests}/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml (100%) rename {Tensile/Tests => Tests}/conftest.py (100%) rename {Tensile/Tests => Tests}/create_tests.py (100%) rename {Tensile/Tests => Tests}/disabled/classic/test_convolution.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter1d.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter1d_simple.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter2d_simple.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter3d_simple.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter5d_simple.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act2d_filter1d.yaml (100%) rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act2d_filter1d_simple.yaml (100%) rename 
{Tensile/Tests => Tests}/disabled/direct_to_lds/dtl_dgemm.yaml (100%) rename {Tensile/Tests => Tests}/disabled/direct_to_lds/dtl_dgemm_lite.yaml (100%) rename {Tensile/Tests => Tests}/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml (100%) rename {Tensile/Tests => Tests}/disabled/hgemm_nn_source.yaml (100%) rename {Tensile/Tests => Tests}/disabled/multi_sum/test_.py (100%) rename {Tensile/Tests => Tests}/disabled/starter_packed_case.yaml (100%) rename {Tensile/Tests => Tests}/disabled/stridea0_pack_nt.yaml (100%) rename {Tensile/Tests => Tests}/disabled/strideb0_pack_nn.yaml (100%) rename {Tensile/Tests => Tests}/disabled/test_assertion_selection.yaml (100%) rename {Tensile/Tests => Tests}/disabled/test_create_library.yaml (100%) rename {Tensile/Tests => Tests}/dot/mixmad-nt.yaml (100%) rename {Tensile/Tests => Tests}/dot/mixmad.yaml (100%) rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_nn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_nt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_tn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_tt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/dgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/emulation/double_complex/double_complex_hip_cn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8b8s_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8hs_gfx940.yaml (100%) rename {Tensile/Tests => 
Tests}/emulation/float8/b8gemm_b8ss_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8f8s_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8hs_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8ss_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/float_complex/float_complex_hip_cc.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_nn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_nt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_tn.yaml (100%) rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_tt.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/1LDSB.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/cgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/cgemm_asm_conjugate.yaml (100%) rename 
{Tensile/Tests => Tests}/emulation/mfma/dgemm.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/hpa_bfloat16_gemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/hpa_hgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/emulation/mfma/sgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/big_tensor/biga.yaml (100%) rename {Tensile/Tests => Tests}/extended/big_tensor/bigskinny_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/big_tensor/largec.yaml (100%) rename {Tensile/Tests => Tests}/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml (100%) rename {Tensile/Configs => Tests/extended/bufferload_offset}/rocblas_sgemm_bufferload_limit.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic/test_persistent.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic/test_tensor_contraction.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic_source/test_dgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic_source/test_hgemm_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic_source/test_hgemm_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic_source/test_hgemm_tn_tt.yaml (100%) rename {Tensile/Tests => Tests}/extended/classic_source/test_sgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/YamlBuilder.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/header.yml (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/conftest.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_backwarddata_nchw.py (100%) 
rename {Tensile/Tests => Tests}/extended/convolution_config/test_backwardweights_nchw.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_bad_input.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_conv_vs_contraction.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_cnhw.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_nchw.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_nchw_ckyx.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_nhwc.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_pad.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/test_simple.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/unittests/test_problem_sizes.py (100%) rename {Tensile/Tests => Tests}/extended/convolution_config/unittests/test_string_swap.py (100%) rename {Tensile/Tests => Tests}/extended/custom_kernel/ck_dgemm_90a_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_dgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_hgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_sgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_tsgr_f8.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_tsgr_hgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_tsgr_sgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_cgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_dgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_f8gemm.yaml (100%) rename {Tensile/Tests => 
Tests}/extended/direct_to_vgpr/dtv_hgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_igemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/dot2/hgemm_hpa_dot2_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/dot2/hgemm_hpa_dot2_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/dot2/hgemm_hpa_dot2_tn_2.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_cc.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_cn.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_ct.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_nc.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_tc.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_tt.yaml (100%) rename {Tensile/Tests => Tests}/extended/flat/test_dgemm_asm_flat.yaml (100%) rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat.yaml (100%) rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat_tt.yaml (100%) rename {Tensile/Tests => Tests}/extended/float8/f8gemm-hybrid-ss.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_cc.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_cn.yaml (100%) rename {Tensile/Tests => 
Tests}/extended/float_complex/cgemm_hip_source_ct.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_nc.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_tc.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_tt.yaml (100%) rename {Tensile/Tests => Tests}/extended/fractional/test_dgemm_fractional_tile_sweep.yaml (100%) rename {Tensile/Tests => Tests}/extended/fractional/test_hgemm_fractional_tile_sweep.yaml (100%) rename {Tensile/Tests => Tests}/extended/fractional/test_sgemm_fractional_edge.yaml (100%) rename {Tensile/Tests => Tests}/extended/fractional/test_sgemm_fractional_tile_sweep.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/hgemm_gsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/hgemm_gsu_minkforgsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_batch.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_beta0.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_beta1.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_beta2.yaml (100%) rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_usebeta0.yaml (100%) rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_tt.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/bfloat16_lsu_mfma.yaml (100%) rename 
{Tensile/Tests => Tests}/extended/local_split_u/cgemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/dgemm_lsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/dgemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/f8gemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu_grvw2.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/igemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/sgemm_lsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/sgemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/local_split_u/zgemm_lsu_mfma.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_1sum_zp.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml (100%) rename {Tensile/Tests => 
Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml (100%) rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/2sum.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_gsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_gsu_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_gsu_src.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_src.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/3sum_gsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum/simple_sum2_scrambled.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/1sum_gsu_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/1sum_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsu_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsuremainder.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_pbd.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_scrambled_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum_gsu.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum_gsu_simple.yaml (100%) rename {Tensile/Tests => 
Tests}/extended/multi_sum_psd/3sum_simple.yaml (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/README (100%) rename {Tensile/Tests => Tests}/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml (100%) rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/multi_free2.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/multi_free_batch.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/packed_perf_nn.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/simple_stridea0_pack.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/simple_strideb0_pack.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/strideb0_pack_nt.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/strideb0_pack_tn.yaml (100%) rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/vectorstore0.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_NN.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_NT.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_TN.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_TT.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_NN.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_NT.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_TN.yaml (100%) rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_TT.yaml (100%) rename {Tensile/Tests => Tests}/extended/stream_k/sk_2tile_hgemm_hhs.yaml (100%) rename {Tensile/Tests => 
Tests}/extended/stream_k/sk_2tile_sgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/stream_k/sk_hgemm_hhs.yaml (100%) rename {Tensile/Tests => Tests}/extended/stream_k/sk_sgemm.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/README (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/allownofree.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/assert_size_equal.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/exact_conv.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/filter.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/ncdhw.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/sweep_packed_dims.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle0.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle1.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle2.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle3.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/test_nchw_filter_contraction.yaml (100%) rename {Tensile/Tests => Tests}/extended/tensor_contraction/tlu0_non_unit_stride.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides/simple_use_initial_strides_1.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_1.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_2.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_strides.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_strides1.yaml (100%) rename {Tensile/Tests => 
Tests}/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml (100%) rename {Tensile/Tests => Tests}/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml (100%) rename {Tensile/Tests => Tests}/extended/vector_width/hgemm_nn_asm.yaml (100%) rename {Tensile/Tests => Tests}/extended/vector_width/sgemm_nn_asm.yaml (100%) rename {Tensile/Tests => Tests}/extended/vector_width/sgemm_nn_source.yaml (100%) rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_2sum_zpother.yaml (100%) rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_1sum.yaml (100%) rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_2sum_zp_both.yaml (100%) rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_2sum_zp_other.yaml (100%) rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml (100%) rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_3sum_zp_other.yaml (100%) rename {Tensile/Tests => Tests}/hipModuleLoad_timing/Makefile (100%) rename {Tensile/Tests => Tests}/hipModuleLoad_timing/hipModuleLoadTiming.cpp (100%) rename {Tensile/Tests => Tests}/integration/test_integration.py (100%) rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml (100%) rename {Tensile/Tests => 
Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/cov/COV4.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/cov/COV5.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/cov/COVDefault.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/dgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/bfloat16_denorm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/dgemm_denorm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/hgemm_denorm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/sgemm_denorm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/denorm/sgemm_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/dgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/dgemm_general_batch_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_cc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_cn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_ct.yaml (100%) rename {Tensile/Tests => 
Tests}/pre_checkin/double_complex/double_complex_asm_nc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_tc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_cc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_cn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_ct.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_nc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_tc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_cc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_cn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_ct.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_nc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_tc.yaml 
(100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_cc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_cn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_ct.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_nc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_tc.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_general_batch_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_nt.yaml (100%) rename {Tensile/Tests => 
Tests}/pre_checkin/hgemm_hpa_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/igemm_hpa_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/igemm_hpa_hip_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/1LDSB.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/c-tile-reuse-no-nll.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/cgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/cgemm_asm_conjugate.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_gb_global_ldd.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_large_offset.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml (100%) rename {Tensile/Tests => 
Tests}/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_split_lds.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hsgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_64bit_offset.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_64bit_offset_post.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_general_batch_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_split_lds.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/wider_local_read.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/zgemm_asm.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/mfma/zgemm_asm_conjugate.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/nll_reproduce_bug.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml (100%) rename 
{Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/regression/persistent_kernel.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_tn_bigk.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_exact_dict.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/sgemm_general_batch_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/source/test_dgemm_defaults.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/source/test_hgemm_defaults.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/source/test_hgemm_hpa.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/source/test_sgemm_defaults.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/wmma/hgemm_wmma.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/wmma/hpa_hgemm_wmma.yaml (100%) rename {Tensile/Tests => Tests}/pre_checkin/wmma/hpa_igemm_wmma.yaml (100%) rename {Tensile/Tests => Tests}/special/global_split_u_src/README (100%) rename {Tensile/Tests => Tests}/special/global_split_u_src/hgemm_gsu.yaml (100%) rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_beta0.yaml (100%) rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_beta1.yaml (100%) rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_beta2.yaml (100%) rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_usebeta0.yaml (100%) rename 
{Tensile/Tests => Tests}/special/igemm/igemm_hpa_hip_lsu.yaml (100%) rename {Tensile/Tests => Tests}/special/igemm/igemm_hpa_hip_nn.yaml (100%) rename {Tensile/Tests => Tests}/special/igemm/igemm_hpa_hip_tt.yaml (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/hardcodedParameters.yaml (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/initialSolutionParameters.yaml (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary.yaml (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx1010.co (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx1011.co (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx803.co (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx900.co (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx906.co (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx908.co (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/metadata.yaml (100%) rename {Tensile/Tests => Tests}/test_data/unit/library_data/problemType.yaml (100%) rename {Tensile/Tests => Tests}/test_data/unit/solutions/solutions_nn_3.yaml (100%) rename 
{Tensile/Tests => Tests}/unit/__init__.py (100%) rename {Tensile/Tests => Tests}/unit/customKernels/TestKernel.s (100%) rename {Tensile/Tests => Tests}/unit/replacement/bad_file/bad.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/duplicate_kernel/a.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/duplicate_kernel/b.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v2/baz.s.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v2/kernel_named_bar.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v2/kernel_named_foo.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v3/baz.s.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v3/kernel_named_bar.txt (100%) rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v3/kernel_named_foo.txt (100%) rename {Tensile/Tests => Tests}/unit/test_Common.py (100%) rename {Tensile/Tests => Tests}/unit/test_Component.py (100%) rename {Tensile/Tests => Tests}/unit/test_Configuration.py (100%) rename {Tensile/Tests => Tests}/unit/test_CustomKernels.py (100%) rename {Tensile/Tests => Tests}/unit/test_DataType.py (100%) rename {Tensile/Tests => Tests}/unit/test_HardwarePredicates.py (100%) rename {Tensile/Tests => Tests}/unit/test_KernelWriterAssembly.py (100%) rename {Tensile/Tests => Tests}/unit/test_LibraryIO.py (100%) rename {Tensile/Tests => Tests}/unit/test_PerfMetricPredicates.py (100%) rename {Tensile/Tests => Tests}/unit/test_Priority.py (100%) rename {Tensile/Tests => Tests}/unit/test_ReplacementKernels.py (100%) rename {Tensile/Tests => Tests}/unit/test_TensileCreateLibrary.py (100%) rename {Tensile/Tests => Tests}/unit/test_conv_problem.py (100%) rename {Tensile/Tests => Tests}/unit/test_exact_problem.py (100%) rename {Tensile/Tests => Tests}/unit/test_makeProblem.py (100%) rename {Tensile/Tests => Tests}/unit/test_mergeLogic.py (100%) rename {Tensile/Tests => 
Tests}/unit/test_tryAssembler.py (100%) rename {Tensile/Tests => Tests}/unit/test_useGlobalParameters.py (100%) rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_nn.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_nt.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_tn.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_tt.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml (100%) rename {Tensile/Tests => Tests}/vega_20/nightly/local_split_u/igemm_lsu.yaml (100%) rename {Tensile/Tests => Tests}/weekly/assertions/README (100%) rename {Tensile/Tests => Tests}/weekly/assertions/test_hgemm_asem2_asm.yaml (100%) rename {Tensile/Tests => Tests}/weekly/classic_source/test_hgemm_vectors.yaml (100%) rename {Tensile/Tests => Tests}/weekly/classic_source/test_sgemm_vectors.yaml (100%) rename {Tensile/Tests => Tests}/yaml_only/test_config.py (100%) rename {Tensile/Tests => Tests}/yaml_only/test_ya (100%) rename {Tensile => src/Tensile}/AsmMemoryInstruction.py (100%) rename {Tensile => src/Tensile}/AsmRegisterPool.py (100%) rename {Tensile => src/Tensile}/AsmUtils.py (100%) rename {Tensile => src/Tensile}/BenchmarkProblems.py (100%) rename {Tensile => src/Tensile}/BenchmarkSplitter.py (100%) rename {Tensile => src/Tensile}/BenchmarkStructs.py (100%) rename {Tensile => src/Tensile}/ClientExecutable.py (100%) rename {Tensile => src/Tensile}/ClientWriter.py (100%) rename {Tensile => src/Tensile}/Code.py (100%) rename {Tensile => src/Tensile}/Common.py (100%) rename {Tensile => src/Tensile}/Component.py (100%) rename {Tensile => src/Tensile}/Components/ComputeStoreVgprs.py (100%) rename {Tensile => src/Tensile}/Components/LocalRead.py (100%) rename {Tensile => src/Tensile}/Components/LraTileAssignment.py 
(100%) rename {Tensile => src/Tensile}/Components/MAC_BF16_HPA.py (100%) rename {Tensile => src/Tensile}/Components/MAC_F16.py (100%) rename {Tensile => src/Tensile}/Components/MAC_F16_HPA.py (100%) rename {Tensile => src/Tensile}/Components/MAC_F32.py (100%) rename {Tensile => src/Tensile}/Components/MAC_F32C.py (100%) rename {Tensile => src/Tensile}/Components/MAC_F64.py (100%) rename {Tensile => src/Tensile}/Components/MAC_F64C.py (100%) rename {Tensile => src/Tensile}/Components/MAC_I8X4.py (100%) rename {Tensile => src/Tensile}/Components/MAC_I8_HPA.py (100%) rename {Tensile => src/Tensile}/Components/MFMA.py (100%) rename {Tensile => src/Tensile}/Components/NotLocalFullTileElements.py (100%) rename {Tensile => src/Tensile}/Components/Priority.py (100%) rename {Tensile => src/Tensile}/Components/PseudoRandomGenerator.py (100%) rename {Tensile => src/Tensile}/Components/ShiftVectorComponents.py (100%) rename {Tensile => src/Tensile}/Components/Signature.py (100%) rename {Tensile => src/Tensile}/Components/__init__.py (100%) rename {Tensile => src/Tensile}/Configuration.py (100%) rename {Tensile => src/Tensile}/Contractions.py (100%) rename {Tensile => src/Tensile}/CustomKernels.py (100%) rename {Tensile => src/Tensile}/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s (100%) rename {Tensile => src/Tensile}/DataType.py (100%) rename {Tensile => src/Tensile}/EmbeddedData.py (100%) rename {Tensile => src/Tensile}/GenerateSummations.py (100%) rename {Tensile => src/Tensile}/Hardware.py (100%) rename {Tensile => src/Tensile}/KernelWriter.py (100%) rename {Tensile => src/Tensile}/KernelWriterAssembly.py (100%) rename {Tensile => src/Tensile}/KernelWriterBase.py (100%) rename {Tensile => src/Tensile}/KernelWriterBetaOnly.py (100%) rename {Tensile => src/Tensile}/KernelWriterConversion.py (100%) rename {Tensile => src/Tensile}/KernelWriterSource.py (100%) rename {Tensile => src/Tensile}/KernelWriterStreamKInit.py (100%) rename {Tensile 
=> src/Tensile}/LibraryIO.py (100%) rename {Tensile => src/Tensile}/LibraryLogic.py (100%) rename {Tensile => src/Tensile}/Parallel.py (100%) rename {Tensile => src/Tensile}/Properties.py (100%) rename {Tensile => src/Tensile}/ReplacementKernels.py (100%) rename {Tensile => src/Tensile}/SolutionLibrary.py (100%) rename {Tensile => src/Tensile}/SolutionSelectionLibrary.py (100%) rename {Tensile => src/Tensile}/SolutionStructs.py (100%) rename {Tensile => src/Tensile}/SolutionWriter.py (100%) rename {Tensile => src/Tensile}/Tensile.py (100%) rename {Tensile => src/Tensile}/TensileBenchmarkCluster.py (100%) rename {Tensile => src/Tensile}/TensileBenchmarkClusterScripts.py (100%) rename {Tensile => src/Tensile}/TensileBenchmarkLibraryClient.py (100%) rename {Tensile => src/Tensile}/TensileClientConfig.py (100%) rename {Tensile => src/Tensile}/TensileCreateLibrary.py (100%) rename {Tensile => src/Tensile}/TensileLibLogicToYaml.py (100%) rename {Tensile => src/Tensile}/TensileMergeLibrary.py (100%) rename {Tensile => src/Tensile}/TensileRetuneLibrary.py (100%) rename {Tensile => src/Tensile}/TensileUpdateLibrary.py (100%) rename {Tensile => src/Tensile}/Utils.py (100%) rename {Tensile => src/Tensile}/__init__.py (100%) rename {Tensile => src/Tensile}/bin/Tensile (100%) rename {Tensile => src/Tensile}/bin/TensileBenchmarkCluster (100%) rename {Tensile => src/Tensile}/bin/TensileClientConfig (100%) rename {Tensile => src/Tensile}/bin/TensileCreateLibrary (100%) rename {Tensile => src/Tensile}/bin/TensileGenerateSummations (100%) rename {Tensile => src/Tensile}/bin/TensileLibLogicToYaml (100%) rename {Tensile => src/Tensile}/bin/TensileMergeLibrary (100%) rename {Tensile => src/Tensile}/bin/TensileRetuneLibrary (100%) rename {Tensile => src/Tensile}/bin/TensileUpdateLibrary (100%) rename {Tensile => src/Tensile/data}/Configs/alternate-format/sizeList-example.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/alternate-format/vega20-example.yaml (100%) rename {Tensile 
=> src/Tensile/data}/Configs/deep_bench_nn.csv (100%) rename {Tensile => src/Tensile/data}/Configs/deep_bench_nn_batched.csv (100%) rename {Tensile => src/Tensile/data}/Configs/deep_bench_nt.csv (100%) rename {Tensile => src/Tensile/data}/Configs/deep_bench_nt_batched.csv (100%) rename {Tensile => src/Tensile/data}/Configs/deep_bench_tn.csv (100%) rename {Tensile => src/Tensile/data}/Configs/deep_bench_tn_batched.csv (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_hpa_bf16_nt_test.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_lite_test.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_test.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/sgemm_tlunn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/mfma/sgemm_transposeLDS.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml (100%) rename 
{Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/Makefile (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/README.md (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/README (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml (100%) 
rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml (100%) rename {Tensile 
=> src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/README.md (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/README.md (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml (100%) 
rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/logs/convert.log (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/logs/merge.log (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/miopen/boiler/header.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/library_logic_hip_only.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/library_logic_vega10_only.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/library_logic_vega20_only.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/convert_cfg.py (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/make_all.sh (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_gemm_large.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/resnet50_all.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/resnet50_batch64.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/resnet_batch64_B.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nt/deepbench_gemm_large.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nt/resnet50_all.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/tn/deepbench_gemm_large.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/problems/tn/resnet50_all.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_large_explore_3.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_large_explore_5.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_quick.yml (100%) rename 
{Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_skinny_explore_3.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_skinny_explore_5.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_large_explore_3.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_large_explore_5.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_large_explore_7.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_quick.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_3.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_4.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_5.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_7.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_nn.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_nt.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_tn.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_tt.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_nn.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_nt.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_tn.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_tt.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_nn.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_nt.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_tn.yml (100%) rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_tt.yml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_cgemm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_cgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_cgemm_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_asm_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_asm_square.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_bufferload_limit.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_resume_train_exp.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_asm_single_kernel.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/rocblas_hgemm_bufferload_limit.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_asm_lite.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_inc1_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_nn_hip.yaml (100%) rename {Tensile => 
src/Tensile/data}/Configs/rocblas_hpa_igemm_nt_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_tn_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_tt_hip.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_hsgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_tt.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_hip_single_kernel.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_only.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_single_kernel.yaml (100%) rename {Tensile/Tests/extended/bufferload_offset => src/Tensile/data/Configs}/rocblas_sgemm_bufferload_limit.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_example.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_hip_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_zgemm.yaml (100%) rename {Tensile => src/Tensile/data}/Configs/rocblas_zgemm_asm_lite.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/BDAS/dgemm_kmeans.yaml 
(100%) rename {Tensile => src/Tensile/data}/Perf/BDAS/dgemm_pca.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/BERT/sgemm_xdlops.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/TRANSFORMER/sgemm_xdlops.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/README (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1_af0em.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1_oddpbd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1u2_bdww.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1u2_fwd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x7_fwd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x1_fwd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x1_fwd2.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x1_roundup.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x7u2_fwd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_bwdd_pbd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv/conv_fwd.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv_bwdd_ex0.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv_bwdd_ex1.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv_bwdw_big_gsu.yaml (100%) rename {Tensile => 
src/Tensile/data}/Perf/conv_bwdw_small_gsu.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/conv_fwd_ex0.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/dgemm_large_square.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/hpl.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/hpl_one.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/hpl_quick.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/hpl_quick44k.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/inception/conv_1x1u1.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/inception/conv_1x1u1_starter.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/inception/conv_NxN.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/sgemm_large_square_nn.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/sgemm_large_square_nt.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/sgemm_large_square_tn.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/README (100%) rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/perf_baseline0.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/perf_uis_cd0.yaml (100%) rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml (100%) rename {Tensile => src/Tensile/data}/Source/CMakeLists.txt (100%) rename {Tensile => src/Tensile/data}/Source/EnableWarnings.cmake (100%) rename {Tensile => src/Tensile/data}/Source/FindHIP.cmake (100%) rename {Tensile => src/Tensile/data}/Source/FindOpenCL.cmake (100%) rename {Tensile => src/Tensile/data}/Source/KernelHeader.h (100%) rename {Tensile => src/Tensile/data}/Source/TensileTypes.h (100%) rename {Tensile => src/Tensile/data}/Source/client/CMakeLists.txt (100%) rename {Tensile => src/Tensile/data}/Source/client/include/BenchmarkTimer.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/CSVStackFile.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ClientProblemFactory.hpp 
(100%) rename {Tensile => src/Tensile/data}/Source/client/include/ConvolutionProblem.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/DataInitialization.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/DataInitializationTyped.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitor.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitorListener.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitorType.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitorWindows.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitor_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/LibraryUpdateReporter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/LogReporter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/MetaResultReporter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/MetaRunListener.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/PerformanceReporter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ProgressListener.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/Reference.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ReferenceValidator.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ResultComparison.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ResultFileReporter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ResultReporter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/ResultReporter_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/RunListener.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/include/SolutionIterator.hpp (100%) rename {Tensile => 
src/Tensile/data}/Source/client/include/TimingEvents.hpp (100%) rename {Tensile => src/Tensile/data}/Source/client/main.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/BenchmarkTimer.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/CSVStackFile.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/ClientProblemFactory.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/ConvolutionProblem.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/DataInitialization.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/HardwareMonitor.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/HardwareMonitorListener.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/LibraryUpdateReporter.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/MetaRunListener.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/PerformanceReporter.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/ProgressListener.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/Reference.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/ReferenceValidator.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/ResultFileReporter.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/ResultReporter.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/SolutionIterator.cpp (100%) rename {Tensile => src/Tensile/data}/Source/client/source/TimingEvents.cpp (100%) rename {Tensile => src/Tensile/data}/Source/cmake/FindROCmSMI.cmake (100%) rename {Tensile => src/Tensile/data}/Source/hip_f8_impl.h (100%) rename {Tensile => src/Tensile/data}/Source/lib/CMakeLists.txt (100%) rename {Tensile => src/Tensile/data}/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml (100%) rename {Tensile => 
src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/AMDGPU.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/AMDGPUPredicates.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/AMDGPU_Detail.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/CachingLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Comparison.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblem.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblemPredicates.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblemProperties.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblem_Detail.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblem_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionSolution.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionSolution_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Contractions.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_BFloat16.hpp (100%) 
rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Half.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Int8.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Int8x4.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_XFloat32.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Debug.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DecisionTree.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DecisionTreeLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Distance.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DistinctType.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/EmbeddedData.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/EmbeddedLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ExactLogicLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/KernelArguments.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/KernelLanguageTypes.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MLFeatures.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Macros.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MapLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MasterSolutionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MatchingLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/PerformanceMetricTypes.hpp 
(100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/PlaceholderLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Predicates.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ProblemKey.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Properties.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/PropertyMatching.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ScalarValueTypes.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Base.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Containers.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/HasTraits.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/MLFeatures.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/MapLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Predicates.hpp (100%) rename 
{Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Properties.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SingleSolutionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Singleton.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SolutionLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SolutionMapLibrary.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Tensile.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Tensile_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorDescriptor.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorOps.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorOps_fwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/UserDrivenTuningParser.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Utils.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/geom.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip/HipHardware.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip/HipUtils.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip_f8_impl.h (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/llvm/Loading.hpp (100%) 
rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/llvm/YAML.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/msgpack/Loading.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/msgpack/MessagePack.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclFwd.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclHardware.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclUtils.hpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/AMDGPU.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ArithmeticUnitTypes.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ContractionProblem.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ContractionSolution.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/DataTypes.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/Debug.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/EmbeddedData.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/EmbeddedLibrary.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/KernelArguments.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/KernelLanguageTypes.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/MLFeatures.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/PerformanceMetricTypes.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ScalarValueTypes.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/Tensile.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/TensorDescriptor.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/TensorOps.cpp (100%) rename {Tensile => 
src/Tensile/data}/Source/lib/source/UserDrivenTuningParser.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/Utils.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/hip/CMakeLists.txt (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/hip/HipHardware.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/hip/HipSolutionAdapter.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/llvm/Loading.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/llvm/YAML.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/msgpack/MessagePack.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/CMakeLists.txt (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/OclHardware.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/OclSolutionAdapter.cpp (100%) rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/OclUtils.cpp (100%) rename {Tensile => src/Tensile/data}/Source/multigpu.sh (100%) rename {Tensile => src/Tensile/data}/Source/tensile_bfloat16.h (100%) rename {Tensile => src/Tensile/data}/Source/tensile_float8_bfloat8.h (100%) rename {Tensile => src/Tensile/data}/Source/winners.awk (100%) rename {Tensile => src/Tensile/data}/Utilities/archive/merge_rocblas_yaml_files.py (100%) rename {Tensile => src/Tensile/data}/Utilities/merge.py (100%) rename {Tensile => src/Tensile/data}/cmake/TensileConfig.cmake (100%) rename {Tensile => src/Tensile/data}/cmake/TensileConfigVersion.cmake (100%) diff --git a/Tensile/Configs/build_client.yaml b/Tensile/Configs/build_client.yaml deleted file mode 100644 index 70cc3f62f1..0000000000 --- a/Tensile/Configs/build_client.yaml +++ /dev/null @@ -1,28 +0,0 @@ -GlobalParameters: - MinimumRequiredVersion: 4.4.0 - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - CMakeBuildType: Release - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - LibraryPrintDebug: 
False - NumElementsToValidate: 0 - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - Platform: 0 - Device: 0 - KernelTime: True - DataInitTypeBeta : 0 - -BenchmarkProblems: - - - - # ProblemType - OperationType: GEMM - DataType: s - - - # BenchmarkProblemSizeGroup - Standard - InitialSolutionParameters: diff --git a/Tensile/Tests/bugs/2sum_src_pgr1_smallsum.yaml b/Tests/bugs/2sum_src_pgr1_smallsum.yaml similarity index 100% rename from Tensile/Tests/bugs/2sum_src_pgr1_smallsum.yaml rename to Tests/bugs/2sum_src_pgr1_smallsum.yaml diff --git a/Tensile/Tests/bugs/d2lds.yaml b/Tests/bugs/d2lds.yaml similarity index 100% rename from Tensile/Tests/bugs/d2lds.yaml rename to Tests/bugs/d2lds.yaml diff --git a/Tensile/Tests/bugs/fractional_plus_pbc.yaml b/Tests/bugs/fractional_plus_pbc.yaml similarity index 100% rename from Tensile/Tests/bugs/fractional_plus_pbc.yaml rename to Tests/bugs/fractional_plus_pbc.yaml diff --git a/Tensile/Tests/bugs/free10_swap.yaml b/Tests/bugs/free10_swap.yaml similarity index 100% rename from Tensile/Tests/bugs/free10_swap.yaml rename to Tests/bugs/free10_swap.yaml diff --git a/Tensile/Tests/bugs/hpa_beta.yaml b/Tests/bugs/hpa_beta.yaml similarity index 100% rename from Tensile/Tests/bugs/hpa_beta.yaml rename to Tests/bugs/hpa_beta.yaml diff --git a/Tensile/Tests/bugs/nosourcetmp.yaml b/Tests/bugs/nosourcetmp.yaml similarity index 100% rename from Tensile/Tests/bugs/nosourcetmp.yaml rename to Tests/bugs/nosourcetmp.yaml diff --git a/Tensile/Tests/bugs/simple_use_initial_strides_1.yaml b/Tests/bugs/simple_use_initial_strides_1.yaml similarity index 100% rename from Tensile/Tests/bugs/simple_use_initial_strides_1.yaml rename to Tests/bugs/simple_use_initial_strides_1.yaml diff --git a/Tensile/Tests/bugs/swizzlec1.yaml b/Tests/bugs/swizzlec1.yaml similarity index 100% rename from Tensile/Tests/bugs/swizzlec1.yaml rename to Tests/bugs/swizzlec1.yaml diff --git 
a/Tensile/Tests/bugs/test_glvw4_edge_no_asem.yaml b/Tests/bugs/test_glvw4_edge_no_asem.yaml similarity index 100% rename from Tensile/Tests/bugs/test_glvw4_edge_no_asem.yaml rename to Tests/bugs/test_glvw4_edge_no_asem.yaml diff --git a/Tensile/Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml b/Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml similarity index 100% rename from Tensile/Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml rename to Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml diff --git a/Tensile/Tests/conftest.py b/Tests/conftest.py similarity index 100% rename from Tensile/Tests/conftest.py rename to Tests/conftest.py diff --git a/Tensile/Tests/create_tests.py b/Tests/create_tests.py similarity index 100% rename from Tensile/Tests/create_tests.py rename to Tests/create_tests.py diff --git a/Tensile/Tests/disabled/classic/test_convolution.yaml b/Tests/disabled/classic/test_convolution.yaml similarity index 100% rename from Tensile/Tests/disabled/classic/test_convolution.yaml rename to Tests/disabled/classic/test_convolution.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d.yaml b/Tests/disabled/convolution/test_conv_act1d_filter1d.yaml similarity index 100% rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d.yaml rename to Tests/disabled/convolution/test_conv_act1d_filter1d.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml similarity index 100% rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml rename to Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml similarity index 100% rename from 
Tensile/Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml rename to Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml similarity index 100% rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml rename to Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml similarity index 100% rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml rename to Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d.yaml b/Tests/disabled/convolution/test_conv_act2d_filter1d.yaml similarity index 100% rename from Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d.yaml rename to Tests/disabled/convolution/test_conv_act2d_filter1d.yaml diff --git a/Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml b/Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml similarity index 100% rename from Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml rename to Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml diff --git a/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml b/Tests/disabled/direct_to_lds/dtl_dgemm.yaml similarity index 100% rename from Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml rename to Tests/disabled/direct_to_lds/dtl_dgemm.yaml diff --git a/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml b/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml similarity index 100% rename from Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml rename to Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml diff --git 
a/Tensile/Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml b/Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml similarity index 100% rename from Tensile/Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml rename to Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml diff --git a/Tensile/Tests/disabled/hgemm_nn_source.yaml b/Tests/disabled/hgemm_nn_source.yaml similarity index 100% rename from Tensile/Tests/disabled/hgemm_nn_source.yaml rename to Tests/disabled/hgemm_nn_source.yaml diff --git a/Tensile/Tests/disabled/multi_sum/test_.py b/Tests/disabled/multi_sum/test_.py similarity index 100% rename from Tensile/Tests/disabled/multi_sum/test_.py rename to Tests/disabled/multi_sum/test_.py diff --git a/Tensile/Tests/disabled/starter_packed_case.yaml b/Tests/disabled/starter_packed_case.yaml similarity index 100% rename from Tensile/Tests/disabled/starter_packed_case.yaml rename to Tests/disabled/starter_packed_case.yaml diff --git a/Tensile/Tests/disabled/stridea0_pack_nt.yaml b/Tests/disabled/stridea0_pack_nt.yaml similarity index 100% rename from Tensile/Tests/disabled/stridea0_pack_nt.yaml rename to Tests/disabled/stridea0_pack_nt.yaml diff --git a/Tensile/Tests/disabled/strideb0_pack_nn.yaml b/Tests/disabled/strideb0_pack_nn.yaml similarity index 100% rename from Tensile/Tests/disabled/strideb0_pack_nn.yaml rename to Tests/disabled/strideb0_pack_nn.yaml diff --git a/Tensile/Tests/disabled/test_assertion_selection.yaml b/Tests/disabled/test_assertion_selection.yaml similarity index 100% rename from Tensile/Tests/disabled/test_assertion_selection.yaml rename to Tests/disabled/test_assertion_selection.yaml diff --git a/Tensile/Tests/disabled/test_create_library.yaml b/Tests/disabled/test_create_library.yaml similarity index 100% rename from Tensile/Tests/disabled/test_create_library.yaml rename to Tests/disabled/test_create_library.yaml diff --git a/Tensile/Tests/dot/mixmad-nt.yaml b/Tests/dot/mixmad-nt.yaml similarity index 100% rename from 
Tensile/Tests/dot/mixmad-nt.yaml rename to Tests/dot/mixmad-nt.yaml diff --git a/Tensile/Tests/dot/mixmad.yaml b/Tests/dot/mixmad.yaml similarity index 100% rename from Tensile/Tests/dot/mixmad.yaml rename to Tests/dot/mixmad.yaml diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml similarity index 100% rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml rename to Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml similarity index 100% rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml rename to Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml similarity index 100% rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml rename to Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml similarity index 100% rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml rename to Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml diff --git a/Tensile/Tests/emulation/dgemm_asm.yaml b/Tests/emulation/dgemm_asm.yaml similarity index 100% rename from Tensile/Tests/emulation/dgemm_asm.yaml rename to Tests/emulation/dgemm_asm.yaml diff --git a/Tensile/Tests/emulation/double_complex/double_complex_hip_cn.yaml b/Tests/emulation/double_complex/double_complex_hip_cn.yaml similarity index 100% rename from Tensile/Tests/emulation/double_complex/double_complex_hip_cn.yaml rename to Tests/emulation/double_complex/double_complex_hip_cn.yaml diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml 
b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml rename to Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml rename to Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml rename to Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml diff --git 
a/Tensile/Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml rename to Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml b/Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml rename to Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml similarity index 100% 
rename from Tensile/Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml rename to Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml rename to Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml rename to Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml rename to Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml diff --git a/Tensile/Tests/emulation/float_complex/float_complex_hip_cc.yaml b/Tests/emulation/float_complex/float_complex_hip_cc.yaml similarity index 100% rename from Tensile/Tests/emulation/float_complex/float_complex_hip_cc.yaml rename to Tests/emulation/float_complex/float_complex_hip_cc.yaml diff --git a/Tensile/Tests/emulation/hgemm_asm_nn.yaml b/Tests/emulation/hgemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_asm_nn.yaml rename to Tests/emulation/hgemm_asm_nn.yaml diff --git a/Tensile/Tests/emulation/hgemm_asm_nt.yaml b/Tests/emulation/hgemm_asm_nt.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_asm_nt.yaml rename to Tests/emulation/hgemm_asm_nt.yaml diff --git a/Tensile/Tests/emulation/hgemm_asm_tn.yaml b/Tests/emulation/hgemm_asm_tn.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_asm_tn.yaml rename to Tests/emulation/hgemm_asm_tn.yaml diff --git a/Tensile/Tests/emulation/hgemm_asm_tt.yaml b/Tests/emulation/hgemm_asm_tt.yaml similarity index 100% rename from 
Tensile/Tests/emulation/hgemm_asm_tt.yaml rename to Tests/emulation/hgemm_asm_tt.yaml diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_nn.yaml b/Tests/emulation/hgemm_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_hpa_asm_nn.yaml rename to Tests/emulation/hgemm_hpa_asm_nn.yaml diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_nt.yaml b/Tests/emulation/hgemm_hpa_asm_nt.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_hpa_asm_nt.yaml rename to Tests/emulation/hgemm_hpa_asm_nt.yaml diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_tn.yaml b/Tests/emulation/hgemm_hpa_asm_tn.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_hpa_asm_tn.yaml rename to Tests/emulation/hgemm_hpa_asm_tn.yaml diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_tt.yaml b/Tests/emulation/hgemm_hpa_asm_tt.yaml similarity index 100% rename from Tensile/Tests/emulation/hgemm_hpa_asm_tt.yaml rename to Tests/emulation/hgemm_hpa_asm_tt.yaml diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_nn.yaml b/Tests/emulation/igemm_hpa_hip_nn.yaml similarity index 100% rename from Tensile/Tests/emulation/igemm_hpa_hip_nn.yaml rename to Tests/emulation/igemm_hpa_hip_nn.yaml diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_nt.yaml b/Tests/emulation/igemm_hpa_hip_nt.yaml similarity index 100% rename from Tensile/Tests/emulation/igemm_hpa_hip_nt.yaml rename to Tests/emulation/igemm_hpa_hip_nt.yaml diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_tn.yaml b/Tests/emulation/igemm_hpa_hip_tn.yaml similarity index 100% rename from Tensile/Tests/emulation/igemm_hpa_hip_tn.yaml rename to Tests/emulation/igemm_hpa_hip_tn.yaml diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_tt.yaml b/Tests/emulation/igemm_hpa_hip_tt.yaml similarity index 100% rename from Tensile/Tests/emulation/igemm_hpa_hip_tt.yaml rename to Tests/emulation/igemm_hpa_hip_tt.yaml diff --git a/Tensile/Tests/emulation/mfma/1LDSB.yaml b/Tests/emulation/mfma/1LDSB.yaml similarity 
index 100% rename from Tensile/Tests/emulation/mfma/1LDSB.yaml rename to Tests/emulation/mfma/1LDSB.yaml diff --git a/Tensile/Tests/emulation/mfma/cgemm_asm.yaml b/Tests/emulation/mfma/cgemm_asm.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/cgemm_asm.yaml rename to Tests/emulation/mfma/cgemm_asm.yaml diff --git a/Tensile/Tests/emulation/mfma/cgemm_asm_conjugate.yaml b/Tests/emulation/mfma/cgemm_asm_conjugate.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/cgemm_asm_conjugate.yaml rename to Tests/emulation/mfma/cgemm_asm_conjugate.yaml diff --git a/Tensile/Tests/emulation/mfma/dgemm.yaml b/Tests/emulation/mfma/dgemm.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/dgemm.yaml rename to Tests/emulation/mfma/dgemm.yaml diff --git a/Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml b/Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml rename to Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml diff --git a/Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml b/Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml rename to Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml diff --git a/Tensile/Tests/emulation/mfma/hpa_hgemm_asm.yaml b/Tests/emulation/mfma/hpa_hgemm_asm.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/hpa_hgemm_asm.yaml rename to Tests/emulation/mfma/hpa_hgemm_asm.yaml diff --git a/Tensile/Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml b/Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml rename to Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml diff --git a/Tensile/Tests/emulation/mfma/sgemm.yaml b/Tests/emulation/mfma/sgemm.yaml similarity index 100% rename from 
Tensile/Tests/emulation/mfma/sgemm.yaml rename to Tests/emulation/mfma/sgemm.yaml diff --git a/Tensile/Tests/extended/big_tensor/biga.yaml b/Tests/extended/big_tensor/biga.yaml similarity index 100% rename from Tensile/Tests/extended/big_tensor/biga.yaml rename to Tests/extended/big_tensor/biga.yaml diff --git a/Tensile/Tests/extended/big_tensor/bigskinny_nt.yaml b/Tests/extended/big_tensor/bigskinny_nt.yaml similarity index 100% rename from Tensile/Tests/extended/big_tensor/bigskinny_nt.yaml rename to Tests/extended/big_tensor/bigskinny_nt.yaml diff --git a/Tensile/Tests/extended/big_tensor/largec.yaml b/Tests/extended/big_tensor/largec.yaml similarity index 100% rename from Tensile/Tests/extended/big_tensor/largec.yaml rename to Tests/extended/big_tensor/largec.yaml diff --git a/Tensile/Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml b/Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml similarity index 100% rename from Tensile/Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml rename to Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml diff --git a/Tensile/Configs/rocblas_sgemm_bufferload_limit.yaml b/Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_bufferload_limit.yaml rename to Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml diff --git a/Tensile/Tests/extended/classic/test_persistent.yaml b/Tests/extended/classic/test_persistent.yaml similarity index 100% rename from Tensile/Tests/extended/classic/test_persistent.yaml rename to Tests/extended/classic/test_persistent.yaml diff --git a/Tensile/Tests/extended/classic/test_tensor_contraction.yaml b/Tests/extended/classic/test_tensor_contraction.yaml similarity index 100% rename from Tensile/Tests/extended/classic/test_tensor_contraction.yaml rename to Tests/extended/classic/test_tensor_contraction.yaml diff --git 
a/Tensile/Tests/extended/classic_source/test_dgemm.yaml b/Tests/extended/classic_source/test_dgemm.yaml similarity index 100% rename from Tensile/Tests/extended/classic_source/test_dgemm.yaml rename to Tests/extended/classic_source/test_dgemm.yaml diff --git a/Tensile/Tests/extended/classic_source/test_hgemm_nn.yaml b/Tests/extended/classic_source/test_hgemm_nn.yaml similarity index 100% rename from Tensile/Tests/extended/classic_source/test_hgemm_nn.yaml rename to Tests/extended/classic_source/test_hgemm_nn.yaml diff --git a/Tensile/Tests/extended/classic_source/test_hgemm_nt.yaml b/Tests/extended/classic_source/test_hgemm_nt.yaml similarity index 100% rename from Tensile/Tests/extended/classic_source/test_hgemm_nt.yaml rename to Tests/extended/classic_source/test_hgemm_nt.yaml diff --git a/Tensile/Tests/extended/classic_source/test_hgemm_tn_tt.yaml b/Tests/extended/classic_source/test_hgemm_tn_tt.yaml similarity index 100% rename from Tensile/Tests/extended/classic_source/test_hgemm_tn_tt.yaml rename to Tests/extended/classic_source/test_hgemm_tn_tt.yaml diff --git a/Tensile/Tests/extended/classic_source/test_sgemm.yaml b/Tests/extended/classic_source/test_sgemm.yaml similarity index 100% rename from Tensile/Tests/extended/classic_source/test_sgemm.yaml rename to Tests/extended/classic_source/test_sgemm.yaml diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py b/Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py rename to Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/header.yml b/Tests/extended/convolution_config/YamlBuilder/header.yml similarity index 100% rename from Tensile/Tests/extended/convolution_config/YamlBuilder/header.yml rename to Tests/extended/convolution_config/YamlBuilder/header.yml diff --git 
a/Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml b/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml similarity index 100% rename from Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml rename to Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml b/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml similarity index 100% rename from Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml rename to Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml diff --git a/Tensile/Tests/extended/convolution_config/conftest.py b/Tests/extended/convolution_config/conftest.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/conftest.py rename to Tests/extended/convolution_config/conftest.py diff --git a/Tensile/Tests/extended/convolution_config/test_backwarddata_nchw.py b/Tests/extended/convolution_config/test_backwarddata_nchw.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_backwarddata_nchw.py rename to Tests/extended/convolution_config/test_backwarddata_nchw.py diff --git a/Tensile/Tests/extended/convolution_config/test_backwardweights_nchw.py b/Tests/extended/convolution_config/test_backwardweights_nchw.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_backwardweights_nchw.py rename to Tests/extended/convolution_config/test_backwardweights_nchw.py diff --git a/Tensile/Tests/extended/convolution_config/test_bad_input.py b/Tests/extended/convolution_config/test_bad_input.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_bad_input.py rename to Tests/extended/convolution_config/test_bad_input.py diff --git a/Tensile/Tests/extended/convolution_config/test_conv_vs_contraction.py 
b/Tests/extended/convolution_config/test_conv_vs_contraction.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_conv_vs_contraction.py rename to Tests/extended/convolution_config/test_conv_vs_contraction.py diff --git a/Tensile/Tests/extended/convolution_config/test_forward_cnhw.py b/Tests/extended/convolution_config/test_forward_cnhw.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_forward_cnhw.py rename to Tests/extended/convolution_config/test_forward_cnhw.py diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nchw.py b/Tests/extended/convolution_config/test_forward_nchw.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_forward_nchw.py rename to Tests/extended/convolution_config/test_forward_nchw.py diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nchw_ckyx.py b/Tests/extended/convolution_config/test_forward_nchw_ckyx.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_forward_nchw_ckyx.py rename to Tests/extended/convolution_config/test_forward_nchw_ckyx.py diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nhwc.py b/Tests/extended/convolution_config/test_forward_nhwc.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_forward_nhwc.py rename to Tests/extended/convolution_config/test_forward_nhwc.py diff --git a/Tensile/Tests/extended/convolution_config/test_forward_pad.py b/Tests/extended/convolution_config/test_forward_pad.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_forward_pad.py rename to Tests/extended/convolution_config/test_forward_pad.py diff --git a/Tensile/Tests/extended/convolution_config/test_simple.py b/Tests/extended/convolution_config/test_simple.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/test_simple.py rename to 
Tests/extended/convolution_config/test_simple.py diff --git a/Tensile/Tests/extended/convolution_config/unittests/test_problem_sizes.py b/Tests/extended/convolution_config/unittests/test_problem_sizes.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/unittests/test_problem_sizes.py rename to Tests/extended/convolution_config/unittests/test_problem_sizes.py diff --git a/Tensile/Tests/extended/convolution_config/unittests/test_string_swap.py b/Tests/extended/convolution_config/unittests/test_string_swap.py similarity index 100% rename from Tensile/Tests/extended/convolution_config/unittests/test_string_swap.py rename to Tests/extended/convolution_config/unittests/test_string_swap.py diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml b/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml similarity index 100% rename from Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml rename to Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml b/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml similarity index 100% rename from Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml rename to Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml b/Tests/extended/direct_to_lds/dtl_dgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml rename to Tests/extended/direct_to_lds/dtl_dgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml b/Tests/extended/direct_to_lds/dtl_hgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml rename to Tests/extended/direct_to_lds/dtl_hgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_sgemm.yaml b/Tests/extended/direct_to_lds/dtl_sgemm.yaml similarity index 100% rename from 
Tensile/Tests/extended/direct_to_lds/dtl_sgemm.yaml rename to Tests/extended/direct_to_lds/dtl_sgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml b/Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml rename to Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml b/Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml rename to Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml b/Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml rename to Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_cgemm.yaml b/Tests/extended/direct_to_vgpr/dtv_cgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_vgpr/dtv_cgemm.yaml rename to Tests/extended/direct_to_vgpr/dtv_cgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm.yaml b/Tests/extended/direct_to_vgpr/dtv_dgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm.yaml rename to Tests/extended/direct_to_vgpr/dtv_dgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml b/Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml rename to Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml b/Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml rename to Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml diff --git 
a/Tensile/Tests/extended/direct_to_vgpr/dtv_hgemm.yaml b/Tests/extended/direct_to_vgpr/dtv_hgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_vgpr/dtv_hgemm.yaml rename to Tests/extended/direct_to_vgpr/dtv_hgemm.yaml diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_igemm.yaml b/Tests/extended/direct_to_vgpr/dtv_igemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_vgpr/dtv_igemm.yaml rename to Tests/extended/direct_to_vgpr/dtv_igemm.yaml diff --git a/Tensile/Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml b/Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml similarity index 100% rename from Tensile/Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml rename to Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml diff --git a/Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml b/Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml similarity index 100% rename from Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml rename to Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml diff --git a/Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml b/Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml similarity index 100% rename from Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml rename to Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_asm.yaml b/Tests/extended/double_complex/zgemm_asm.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_asm.yaml rename to Tests/extended/double_complex/zgemm_asm.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_cc.yaml b/Tests/extended/double_complex/zgemm_hip_source_cc.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_cc.yaml rename to Tests/extended/double_complex/zgemm_hip_source_cc.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_cn.yaml b/Tests/extended/double_complex/zgemm_hip_source_cn.yaml similarity index 100% rename from 
Tensile/Tests/extended/double_complex/zgemm_hip_source_cn.yaml rename to Tests/extended/double_complex/zgemm_hip_source_cn.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_ct.yaml b/Tests/extended/double_complex/zgemm_hip_source_ct.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_ct.yaml rename to Tests/extended/double_complex/zgemm_hip_source_ct.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_nc.yaml b/Tests/extended/double_complex/zgemm_hip_source_nc.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_nc.yaml rename to Tests/extended/double_complex/zgemm_hip_source_nc.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_nn.yaml b/Tests/extended/double_complex/zgemm_hip_source_nn.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_nn.yaml rename to Tests/extended/double_complex/zgemm_hip_source_nn.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_nt.yaml b/Tests/extended/double_complex/zgemm_hip_source_nt.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_nt.yaml rename to Tests/extended/double_complex/zgemm_hip_source_nt.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_tc.yaml b/Tests/extended/double_complex/zgemm_hip_source_tc.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_tc.yaml rename to Tests/extended/double_complex/zgemm_hip_source_tc.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_tn.yaml b/Tests/extended/double_complex/zgemm_hip_source_tn.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_tn.yaml rename to Tests/extended/double_complex/zgemm_hip_source_tn.yaml diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_tt.yaml 
b/Tests/extended/double_complex/zgemm_hip_source_tt.yaml similarity index 100% rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_tt.yaml rename to Tests/extended/double_complex/zgemm_hip_source_tt.yaml diff --git a/Tensile/Tests/extended/flat/test_dgemm_asm_flat.yaml b/Tests/extended/flat/test_dgemm_asm_flat.yaml similarity index 100% rename from Tensile/Tests/extended/flat/test_dgemm_asm_flat.yaml rename to Tests/extended/flat/test_dgemm_asm_flat.yaml diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat.yaml b/Tests/extended/flat/test_sgemm_asm_flat.yaml similarity index 100% rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat.yaml rename to Tests/extended/flat/test_sgemm_asm_flat.yaml diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat_nt.yaml b/Tests/extended/flat/test_sgemm_asm_flat_nt.yaml similarity index 100% rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat_nt.yaml rename to Tests/extended/flat/test_sgemm_asm_flat_nt.yaml diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat_tn.yaml b/Tests/extended/flat/test_sgemm_asm_flat_tn.yaml similarity index 100% rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat_tn.yaml rename to Tests/extended/flat/test_sgemm_asm_flat_tn.yaml diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat_tt.yaml b/Tests/extended/flat/test_sgemm_asm_flat_tt.yaml similarity index 100% rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat_tt.yaml rename to Tests/extended/flat/test_sgemm_asm_flat_tt.yaml diff --git a/Tensile/Tests/extended/float8/f8gemm-hybrid-ss.yaml b/Tests/extended/float8/f8gemm-hybrid-ss.yaml similarity index 100% rename from Tensile/Tests/extended/float8/f8gemm-hybrid-ss.yaml rename to Tests/extended/float8/f8gemm-hybrid-ss.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_asm.yaml b/Tests/extended/float_complex/cgemm_asm.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_asm.yaml rename to 
Tests/extended/float_complex/cgemm_asm.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_cc.yaml b/Tests/extended/float_complex/cgemm_hip_source_cc.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_cc.yaml rename to Tests/extended/float_complex/cgemm_hip_source_cc.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_cn.yaml b/Tests/extended/float_complex/cgemm_hip_source_cn.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_cn.yaml rename to Tests/extended/float_complex/cgemm_hip_source_cn.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_ct.yaml b/Tests/extended/float_complex/cgemm_hip_source_ct.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_ct.yaml rename to Tests/extended/float_complex/cgemm_hip_source_ct.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_nc.yaml b/Tests/extended/float_complex/cgemm_hip_source_nc.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_nc.yaml rename to Tests/extended/float_complex/cgemm_hip_source_nc.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_nn.yaml b/Tests/extended/float_complex/cgemm_hip_source_nn.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_nn.yaml rename to Tests/extended/float_complex/cgemm_hip_source_nn.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_nt.yaml b/Tests/extended/float_complex/cgemm_hip_source_nt.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_nt.yaml rename to Tests/extended/float_complex/cgemm_hip_source_nt.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_tc.yaml b/Tests/extended/float_complex/cgemm_hip_source_tc.yaml similarity index 100% rename from 
Tensile/Tests/extended/float_complex/cgemm_hip_source_tc.yaml rename to Tests/extended/float_complex/cgemm_hip_source_tc.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_tn.yaml b/Tests/extended/float_complex/cgemm_hip_source_tn.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_tn.yaml rename to Tests/extended/float_complex/cgemm_hip_source_tn.yaml diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_tt.yaml b/Tests/extended/float_complex/cgemm_hip_source_tt.yaml similarity index 100% rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_tt.yaml rename to Tests/extended/float_complex/cgemm_hip_source_tt.yaml diff --git a/Tensile/Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml b/Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml similarity index 100% rename from Tensile/Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml rename to Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml diff --git a/Tensile/Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml b/Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml similarity index 100% rename from Tensile/Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml rename to Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml diff --git a/Tensile/Tests/extended/fractional/test_sgemm_fractional_edge.yaml b/Tests/extended/fractional/test_sgemm_fractional_edge.yaml similarity index 100% rename from Tensile/Tests/extended/fractional/test_sgemm_fractional_edge.yaml rename to Tests/extended/fractional/test_sgemm_fractional_edge.yaml diff --git a/Tensile/Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml b/Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml similarity index 100% rename from Tensile/Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml rename to 
Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml diff --git a/Tensile/Tests/extended/global_split_u/hgemm_gsu.yaml b/Tests/extended/global_split_u/hgemm_gsu.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/hgemm_gsu.yaml rename to Tests/extended/global_split_u/hgemm_gsu.yaml diff --git a/Tensile/Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml b/Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml rename to Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_batch.yaml b/Tests/extended/global_split_u/sgemm_gsu_batch.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_batch.yaml rename to Tests/extended/global_split_u/sgemm_gsu_batch.yaml diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_beta0.yaml b/Tests/extended/global_split_u/sgemm_gsu_beta0.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_beta0.yaml rename to Tests/extended/global_split_u/sgemm_gsu_beta0.yaml diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_beta1.yaml b/Tests/extended/global_split_u/sgemm_gsu_beta1.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_beta1.yaml rename to Tests/extended/global_split_u/sgemm_gsu_beta1.yaml diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_beta2.yaml b/Tests/extended/global_split_u/sgemm_gsu_beta2.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_beta2.yaml rename to Tests/extended/global_split_u/sgemm_gsu_beta2.yaml diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml b/Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml similarity index 100% rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml rename to 
Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml similarity index 100% rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml rename to Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml similarity index 100% rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml rename to Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml similarity index 100% rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml rename to Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml similarity index 100% rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml rename to Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml diff --git a/Tensile/Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml b/Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml rename to Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/local_split_u/cgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/cgemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/cgemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/cgemm_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/local_split_u/dgemm_lsu.yaml b/Tests/extended/local_split_u/dgemm_lsu.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/dgemm_lsu.yaml rename to 
Tests/extended/local_split_u/dgemm_lsu.yaml diff --git a/Tensile/Tests/extended/local_split_u/dgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/dgemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/dgemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/dgemm_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml b/Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu.yaml b/Tests/extended/local_split_u/hgemm_lsu.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/hgemm_lsu.yaml rename to Tests/extended/local_split_u/hgemm_lsu.yaml diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml b/Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml rename to Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/hgemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/hgemm_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml b/Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml rename to Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml diff --git a/Tensile/Tests/extended/local_split_u/igemm_lsu_mfma.yaml b/Tests/extended/local_split_u/igemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/igemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/igemm_lsu_mfma.yaml diff --git 
a/Tensile/Tests/extended/local_split_u/sgemm_lsu.yaml b/Tests/extended/local_split_u/sgemm_lsu.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/sgemm_lsu.yaml rename to Tests/extended/local_split_u/sgemm_lsu.yaml diff --git a/Tensile/Tests/extended/local_split_u/sgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/sgemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/sgemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/sgemm_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/local_split_u/zgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/zgemm_lsu_mfma.yaml similarity index 100% rename from Tensile/Tests/extended/local_split_u/zgemm_lsu_mfma.yaml rename to Tests/extended/local_split_u/zgemm_lsu_mfma.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml b/Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml rename to Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml rename to 
Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml 
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml similarity index 100% rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml diff --git a/Tensile/Tests/extended/multi_sum/2sum.yaml b/Tests/extended/multi_sum/2sum.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/2sum.yaml rename to Tests/extended/multi_sum/2sum.yaml diff --git 
a/Tensile/Tests/extended/multi_sum/2sum_gsu.yaml b/Tests/extended/multi_sum/2sum_gsu.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/2sum_gsu.yaml rename to Tests/extended/multi_sum/2sum_gsu.yaml diff --git a/Tensile/Tests/extended/multi_sum/2sum_gsu_simple.yaml b/Tests/extended/multi_sum/2sum_gsu_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/2sum_gsu_simple.yaml rename to Tests/extended/multi_sum/2sum_gsu_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum/2sum_gsu_src.yaml b/Tests/extended/multi_sum/2sum_gsu_src.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/2sum_gsu_src.yaml rename to Tests/extended/multi_sum/2sum_gsu_src.yaml diff --git a/Tensile/Tests/extended/multi_sum/2sum_src.yaml b/Tests/extended/multi_sum/2sum_src.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/2sum_src.yaml rename to Tests/extended/multi_sum/2sum_src.yaml diff --git a/Tensile/Tests/extended/multi_sum/3sum_gsu.yaml b/Tests/extended/multi_sum/3sum_gsu.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/3sum_gsu.yaml rename to Tests/extended/multi_sum/3sum_gsu.yaml diff --git a/Tensile/Tests/extended/multi_sum/simple_sum2_scrambled.yaml b/Tests/extended/multi_sum/simple_sum2_scrambled.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum/simple_sum2_scrambled.yaml rename to Tests/extended/multi_sum/simple_sum2_scrambled.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml b/Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml rename to Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/1sum_simple.yaml b/Tests/extended/multi_sum_psd/1sum_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/1sum_simple.yaml rename to 
Tests/extended/multi_sum_psd/1sum_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum.yaml b/Tests/extended/multi_sum_psd/2sum.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum.yaml rename to Tests/extended/multi_sum_psd/2sum.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsu.yaml b/Tests/extended/multi_sum_psd/2sum_gsu.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsu.yaml rename to Tests/extended/multi_sum_psd/2sum_gsu.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml b/Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml rename to Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml b/Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml rename to Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml b/Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml rename to Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_pbd.yaml b/Tests/extended/multi_sum_psd/2sum_pbd.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum_pbd.yaml rename to Tests/extended/multi_sum_psd/2sum_pbd.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml b/Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml rename to Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum.yaml 
b/Tests/extended/multi_sum_psd/3sum.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/3sum.yaml rename to Tests/extended/multi_sum_psd/3sum.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum_gsu.yaml b/Tests/extended/multi_sum_psd/3sum_gsu.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/3sum_gsu.yaml rename to Tests/extended/multi_sum_psd/3sum_gsu.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml b/Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml rename to Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum_simple.yaml b/Tests/extended/multi_sum_psd/3sum_simple.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/3sum_simple.yaml rename to Tests/extended/multi_sum_psd/3sum_simple.yaml diff --git a/Tensile/Tests/extended/multi_sum_psd/README b/Tests/extended/multi_sum_psd/README similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/README rename to Tests/extended/multi_sum_psd/README diff --git a/Tensile/Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml b/Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml similarity index 100% rename from Tensile/Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml rename to Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_nn.yaml b/Tests/extended/nonbatched/sgemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/extended/nonbatched/sgemm_asm_nn.yaml rename to Tests/extended/nonbatched/sgemm_asm_nn.yaml diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_nt.yaml b/Tests/extended/nonbatched/sgemm_asm_nt.yaml similarity index 100% rename from Tensile/Tests/extended/nonbatched/sgemm_asm_nt.yaml rename to Tests/extended/nonbatched/sgemm_asm_nt.yaml diff 
--git a/Tensile/Tests/extended/nonbatched/sgemm_asm_tn.yaml b/Tests/extended/nonbatched/sgemm_asm_tn.yaml similarity index 100% rename from Tensile/Tests/extended/nonbatched/sgemm_asm_tn.yaml rename to Tests/extended/nonbatched/sgemm_asm_tn.yaml diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_tt.yaml b/Tests/extended/nonbatched/sgemm_asm_tt.yaml similarity index 100% rename from Tensile/Tests/extended/nonbatched/sgemm_asm_tt.yaml rename to Tests/extended/nonbatched/sgemm_asm_tt.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/multi_free2.yaml b/Tests/extended/pack_tensor_dims/multi_free2.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/multi_free2.yaml rename to Tests/extended/pack_tensor_dims/multi_free2.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/multi_free_batch.yaml b/Tests/extended/pack_tensor_dims/multi_free_batch.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/multi_free_batch.yaml rename to Tests/extended/pack_tensor_dims/multi_free_batch.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/packed_perf_nn.yaml b/Tests/extended/pack_tensor_dims/packed_perf_nn.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/packed_perf_nn.yaml rename to Tests/extended/pack_tensor_dims/packed_perf_nn.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml b/Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml rename to Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml b/Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml rename to Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml diff --git 
a/Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml b/Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml rename to Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml b/Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml rename to Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml diff --git a/Tensile/Tests/extended/pack_tensor_dims/vectorstore0.yaml b/Tests/extended/pack_tensor_dims/vectorstore0.yaml similarity index 100% rename from Tensile/Tests/extended/pack_tensor_dims/vectorstore0.yaml rename to Tests/extended/pack_tensor_dims/vectorstore0.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_NN.yaml b/Tests/extended/stagger_u/big_skinny_A_NN.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_A_NN.yaml rename to Tests/extended/stagger_u/big_skinny_A_NN.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_NT.yaml b/Tests/extended/stagger_u/big_skinny_A_NT.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_A_NT.yaml rename to Tests/extended/stagger_u/big_skinny_A_NT.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_TN.yaml b/Tests/extended/stagger_u/big_skinny_A_TN.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_A_TN.yaml rename to Tests/extended/stagger_u/big_skinny_A_TN.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_TT.yaml b/Tests/extended/stagger_u/big_skinny_A_TT.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_A_TT.yaml rename to Tests/extended/stagger_u/big_skinny_A_TT.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_NN.yaml b/Tests/extended/stagger_u/big_skinny_B_NN.yaml 
similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_B_NN.yaml rename to Tests/extended/stagger_u/big_skinny_B_NN.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_NT.yaml b/Tests/extended/stagger_u/big_skinny_B_NT.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_B_NT.yaml rename to Tests/extended/stagger_u/big_skinny_B_NT.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_TN.yaml b/Tests/extended/stagger_u/big_skinny_B_TN.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_B_TN.yaml rename to Tests/extended/stagger_u/big_skinny_B_TN.yaml diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_TT.yaml b/Tests/extended/stagger_u/big_skinny_B_TT.yaml similarity index 100% rename from Tensile/Tests/extended/stagger_u/big_skinny_B_TT.yaml rename to Tests/extended/stagger_u/big_skinny_B_TT.yaml diff --git a/Tensile/Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml b/Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml similarity index 100% rename from Tensile/Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml rename to Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml diff --git a/Tensile/Tests/extended/stream_k/sk_2tile_sgemm.yaml b/Tests/extended/stream_k/sk_2tile_sgemm.yaml similarity index 100% rename from Tensile/Tests/extended/stream_k/sk_2tile_sgemm.yaml rename to Tests/extended/stream_k/sk_2tile_sgemm.yaml diff --git a/Tensile/Tests/extended/stream_k/sk_hgemm_hhs.yaml b/Tests/extended/stream_k/sk_hgemm_hhs.yaml similarity index 100% rename from Tensile/Tests/extended/stream_k/sk_hgemm_hhs.yaml rename to Tests/extended/stream_k/sk_hgemm_hhs.yaml diff --git a/Tensile/Tests/extended/stream_k/sk_sgemm.yaml b/Tests/extended/stream_k/sk_sgemm.yaml similarity index 100% rename from Tensile/Tests/extended/stream_k/sk_sgemm.yaml rename to Tests/extended/stream_k/sk_sgemm.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/README 
b/Tests/extended/tensor_contraction/README similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/README rename to Tests/extended/tensor_contraction/README diff --git a/Tensile/Tests/extended/tensor_contraction/allownofree.yaml b/Tests/extended/tensor_contraction/allownofree.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/allownofree.yaml rename to Tests/extended/tensor_contraction/allownofree.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/assert_size_equal.yaml b/Tests/extended/tensor_contraction/assert_size_equal.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/assert_size_equal.yaml rename to Tests/extended/tensor_contraction/assert_size_equal.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/exact_conv.yaml b/Tests/extended/tensor_contraction/exact_conv.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/exact_conv.yaml rename to Tests/extended/tensor_contraction/exact_conv.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/filter.yaml b/Tests/extended/tensor_contraction/filter.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/filter.yaml rename to Tests/extended/tensor_contraction/filter.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/ncdhw.yaml b/Tests/extended/tensor_contraction/ncdhw.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/ncdhw.yaml rename to Tests/extended/tensor_contraction/ncdhw.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/sweep_packed_dims.yaml b/Tests/extended/tensor_contraction/sweep_packed_dims.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/sweep_packed_dims.yaml rename to Tests/extended/tensor_contraction/sweep_packed_dims.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle0.yaml b/Tests/extended/tensor_contraction/swizzle0.yaml similarity index 100% rename 
from Tensile/Tests/extended/tensor_contraction/swizzle0.yaml rename to Tests/extended/tensor_contraction/swizzle0.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle1.yaml b/Tests/extended/tensor_contraction/swizzle1.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/swizzle1.yaml rename to Tests/extended/tensor_contraction/swizzle1.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle2.yaml b/Tests/extended/tensor_contraction/swizzle2.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/swizzle2.yaml rename to Tests/extended/tensor_contraction/swizzle2.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle3.yaml b/Tests/extended/tensor_contraction/swizzle3.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/swizzle3.yaml rename to Tests/extended/tensor_contraction/swizzle3.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml b/Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml rename to Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml b/Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml rename to Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml b/Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml similarity index 100% rename from 
Tensile/Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml rename to Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml diff --git a/Tensile/Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml b/Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml similarity index 100% rename from Tensile/Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml rename to Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml diff --git a/Tensile/Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml b/Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml rename to Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml diff --git a/Tensile/Tests/extended/use_initial_strides/test_1.yaml b/Tests/extended/use_initial_strides/test_1.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides/test_1.yaml rename to Tests/extended/use_initial_strides/test_1.yaml diff --git a/Tensile/Tests/extended/use_initial_strides/test_2.yaml b/Tests/extended/use_initial_strides/test_2.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides/test_2.yaml rename to Tests/extended/use_initial_strides/test_2.yaml diff --git a/Tensile/Tests/extended/use_initial_strides/test_strides.yaml b/Tests/extended/use_initial_strides/test_strides.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides/test_strides.yaml rename to Tests/extended/use_initial_strides/test_strides.yaml diff --git a/Tensile/Tests/extended/use_initial_strides/test_strides1.yaml b/Tests/extended/use_initial_strides/test_strides1.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides/test_strides1.yaml rename to Tests/extended/use_initial_strides/test_strides1.yaml diff --git 
a/Tensile/Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml b/Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml rename to Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml diff --git a/Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml b/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml rename to Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml diff --git a/Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml b/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml similarity index 100% rename from Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml rename to Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml diff --git a/Tensile/Tests/extended/vector_width/hgemm_nn_asm.yaml b/Tests/extended/vector_width/hgemm_nn_asm.yaml similarity index 100% rename from Tensile/Tests/extended/vector_width/hgemm_nn_asm.yaml rename to Tests/extended/vector_width/hgemm_nn_asm.yaml diff --git a/Tensile/Tests/extended/vector_width/sgemm_nn_asm.yaml b/Tests/extended/vector_width/sgemm_nn_asm.yaml similarity index 100% rename from Tensile/Tests/extended/vector_width/sgemm_nn_asm.yaml rename to Tests/extended/vector_width/sgemm_nn_asm.yaml diff --git a/Tensile/Tests/extended/vector_width/sgemm_nn_source.yaml b/Tests/extended/vector_width/sgemm_nn_source.yaml similarity index 100% rename from Tensile/Tests/extended/vector_width/sgemm_nn_source.yaml rename to Tests/extended/vector_width/sgemm_nn_source.yaml diff --git a/Tensile/Tests/extended/zeropad/test_zp_2sum_zpother.yaml b/Tests/extended/zeropad/test_zp_2sum_zpother.yaml similarity index 100% 
rename from Tensile/Tests/extended/zeropad/test_zp_2sum_zpother.yaml rename to Tests/extended/zeropad/test_zp_2sum_zpother.yaml diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_1sum.yaml b/Tests/extended/zeropad/test_zp_simple_1sum.yaml similarity index 100% rename from Tensile/Tests/extended/zeropad/test_zp_simple_1sum.yaml rename to Tests/extended/zeropad/test_zp_simple_1sum.yaml diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml b/Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml similarity index 100% rename from Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml rename to Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml b/Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml similarity index 100% rename from Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml rename to Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml b/Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml similarity index 100% rename from Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml rename to Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml b/Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml similarity index 100% rename from Tensile/Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml rename to Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml diff --git a/Tensile/Tests/hipModuleLoad_timing/Makefile b/Tests/hipModuleLoad_timing/Makefile similarity index 100% rename from Tensile/Tests/hipModuleLoad_timing/Makefile rename to Tests/hipModuleLoad_timing/Makefile diff --git a/Tensile/Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp b/Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp similarity index 100% rename from 
Tensile/Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp rename to Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp diff --git a/Tensile/Tests/integration/test_integration.py b/Tests/integration/test_integration.py similarity index 100% rename from Tensile/Tests/integration/test_integration.py rename to Tests/integration/test_integration.py diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml rename to Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml rename to Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml rename to Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml rename to Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml diff --git 
a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml diff --git a/Tensile/Tests/pre_checkin/cov/COV4.yaml b/Tests/pre_checkin/cov/COV4.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/cov/COV4.yaml rename to Tests/pre_checkin/cov/COV4.yaml diff --git 
a/Tensile/Tests/pre_checkin/cov/COV5.yaml b/Tests/pre_checkin/cov/COV5.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/cov/COV5.yaml rename to Tests/pre_checkin/cov/COV5.yaml diff --git a/Tensile/Tests/pre_checkin/cov/COVDefault.yaml b/Tests/pre_checkin/cov/COVDefault.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/cov/COVDefault.yaml rename to Tests/pre_checkin/cov/COVDefault.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml b/Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml rename to Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/dgemm_asm.yaml b/Tests/pre_checkin/denorm/dgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/dgemm_asm.yaml rename to Tests/pre_checkin/denorm/dgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml b/Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml rename to Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml b/Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml rename to Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml b/Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml rename to Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml b/Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml rename to 
Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml b/Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml rename to Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml b/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml rename to Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml b/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml rename to Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml b/Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml rename to Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml diff --git a/Tensile/Tests/pre_checkin/denorm/sgemm_asm_nn.yaml b/Tests/pre_checkin/denorm/sgemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/denorm/sgemm_asm_nn.yaml rename to Tests/pre_checkin/denorm/sgemm_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/dgemm_asm.yaml b/Tests/pre_checkin/dgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/dgemm_asm.yaml rename to Tests/pre_checkin/dgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/dgemm_general_batch_asm.yaml b/Tests/pre_checkin/dgemm_general_batch_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/dgemm_general_batch_asm.yaml rename to Tests/pre_checkin/dgemm_general_batch_asm.yaml diff --git a/Tensile/Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml 
b/Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml rename to Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml similarity index 100% rename from 
Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml rename to Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml rename to 
Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml rename to Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml diff --git 
a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml similarity index 100% rename 
from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml rename to Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml diff --git 
a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml rename to Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml b/Tests/pre_checkin/hgemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml rename to Tests/pre_checkin/hgemm_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml b/Tests/pre_checkin/hgemm_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml rename to Tests/pre_checkin/hgemm_asm_nt.yaml diff --git 
a/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml b/Tests/pre_checkin/hgemm_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml rename to Tests/pre_checkin/hgemm_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml b/Tests/pre_checkin/hgemm_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml rename to Tests/pre_checkin/hgemm_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml b/Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml rename to Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml b/Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml rename to Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml 
similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml rename to Tests/pre_checkin/hgemm_hpa_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml similarity index 100% rename from 
Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml rename to Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml rename to Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml rename to Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml rename to Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml similarity index 100% rename from 
Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/igemm_hpa_asm_nn.yaml b/Tests/pre_checkin/igemm_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/igemm_hpa_asm_nn.yaml rename to Tests/pre_checkin/igemm_hpa_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/igemm_hpa_hip_nn.yaml b/Tests/pre_checkin/igemm_hpa_hip_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/igemm_hpa_hip_nn.yaml rename to Tests/pre_checkin/igemm_hpa_hip_nn.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/1LDSB.yaml b/Tests/pre_checkin/mfma/1LDSB.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/1LDSB.yaml rename to Tests/pre_checkin/mfma/1LDSB.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml b/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml rename to Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml b/Tests/pre_checkin/mfma/cgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml rename to Tests/pre_checkin/mfma/cgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml b/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml rename to Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml b/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml rename to Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_asm.yaml b/Tests/pre_checkin/mfma/dgemm_asm.yaml similarity index 100% rename from 
Tensile/Tests/pre_checkin/mfma/dgemm_asm.yaml rename to Tests/pre_checkin/mfma/dgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml b/Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml rename to Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml b/Tests/pre_checkin/mfma/dgemm_large_offset.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml rename to Tests/pre_checkin/mfma/dgemm_large_offset.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml rename to Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml rename to Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml rename to Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml rename to Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml 
b/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml rename to Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml rename to Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml rename to Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml rename to Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml rename to Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml rename to Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml rename to Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml similarity index 
100% rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml rename to Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml rename to Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml rename to Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml rename to Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml b/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml rename to Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml b/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml rename to Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml b/Tests/pre_checkin/mfma/sgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml rename to Tests/pre_checkin/mfma/sgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml b/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml rename to 
Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml b/Tests/pre_checkin/mfma/sgemm_split_lds.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml rename to Tests/pre_checkin/mfma/sgemm_split_lds.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml b/Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml rename to Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml b/Tests/pre_checkin/mfma/wider_local_read.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml rename to Tests/pre_checkin/mfma/wider_local_read.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/zgemm_asm.yaml b/Tests/pre_checkin/mfma/zgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/zgemm_asm.yaml rename to Tests/pre_checkin/mfma/zgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml b/Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml rename to Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml diff --git a/Tensile/Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml b/Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml rename to Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml 
b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml b/Tests/pre_checkin/regression/persistent_kernel.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml rename to Tests/pre_checkin/regression/persistent_kernel.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml b/Tests/pre_checkin/sgemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml rename to Tests/pre_checkin/sgemm_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml b/Tests/pre_checkin/sgemm_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml rename to Tests/pre_checkin/sgemm_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml b/Tests/pre_checkin/sgemm_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml rename to Tests/pre_checkin/sgemm_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tn_bigk.yaml b/Tests/pre_checkin/sgemm_asm_tn_bigk.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_asm_tn_bigk.yaml rename to Tests/pre_checkin/sgemm_asm_tn_bigk.yaml diff --git 
a/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml b/Tests/pre_checkin/sgemm_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml rename to Tests/pre_checkin/sgemm_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_exact_dict.yaml b/Tests/pre_checkin/sgemm_exact_dict.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_exact_dict.yaml rename to Tests/pre_checkin/sgemm_exact_dict.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml b/Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml rename to Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/source/test_dgemm_defaults.yaml b/Tests/pre_checkin/source/test_dgemm_defaults.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/source/test_dgemm_defaults.yaml rename to Tests/pre_checkin/source/test_dgemm_defaults.yaml diff --git a/Tensile/Tests/pre_checkin/source/test_hgemm_defaults.yaml b/Tests/pre_checkin/source/test_hgemm_defaults.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/source/test_hgemm_defaults.yaml rename to Tests/pre_checkin/source/test_hgemm_defaults.yaml diff --git a/Tensile/Tests/pre_checkin/source/test_hgemm_hpa.yaml b/Tests/pre_checkin/source/test_hgemm_hpa.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/source/test_hgemm_hpa.yaml rename to Tests/pre_checkin/source/test_hgemm_hpa.yaml diff --git a/Tensile/Tests/pre_checkin/source/test_sgemm_defaults.yaml b/Tests/pre_checkin/source/test_sgemm_defaults.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/source/test_sgemm_defaults.yaml rename to Tests/pre_checkin/source/test_sgemm_defaults.yaml diff --git a/Tensile/Tests/pre_checkin/wmma/hgemm_wmma.yaml b/Tests/pre_checkin/wmma/hgemm_wmma.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/wmma/hgemm_wmma.yaml rename to 
Tests/pre_checkin/wmma/hgemm_wmma.yaml diff --git a/Tensile/Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml b/Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml rename to Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml diff --git a/Tensile/Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml b/Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml rename to Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml diff --git a/Tensile/Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml b/Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml rename to Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml diff --git a/Tensile/Tests/special/global_split_u_src/README b/Tests/special/global_split_u_src/README similarity index 100% rename from Tensile/Tests/special/global_split_u_src/README rename to Tests/special/global_split_u_src/README diff --git a/Tensile/Tests/special/global_split_u_src/hgemm_gsu.yaml b/Tests/special/global_split_u_src/hgemm_gsu.yaml similarity index 100% rename from Tensile/Tests/special/global_split_u_src/hgemm_gsu.yaml rename to Tests/special/global_split_u_src/hgemm_gsu.yaml diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml b/Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml similarity index 100% rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml rename to Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml b/Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml similarity index 100% rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml rename to Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml 
b/Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml similarity index 100% rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml rename to Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml b/Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml similarity index 100% rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml rename to Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml diff --git a/Tensile/Tests/special/igemm/igemm_hpa_hip_lsu.yaml b/Tests/special/igemm/igemm_hpa_hip_lsu.yaml similarity index 100% rename from Tensile/Tests/special/igemm/igemm_hpa_hip_lsu.yaml rename to Tests/special/igemm/igemm_hpa_hip_lsu.yaml diff --git a/Tensile/Tests/special/igemm/igemm_hpa_hip_nn.yaml b/Tests/special/igemm/igemm_hpa_hip_nn.yaml similarity index 100% rename from Tensile/Tests/special/igemm/igemm_hpa_hip_nn.yaml rename to Tests/special/igemm/igemm_hpa_hip_nn.yaml diff --git a/Tensile/Tests/special/igemm/igemm_hpa_hip_tt.yaml b/Tests/special/igemm/igemm_hpa_hip_tt.yaml similarity index 100% rename from Tensile/Tests/special/igemm/igemm_hpa_hip_tt.yaml rename to Tests/special/igemm/igemm_hpa_hip_tt.yaml diff --git a/Tensile/Tests/test_data/unit/library_data/hardcodedParameters.yaml b/Tests/test_data/unit/library_data/hardcodedParameters.yaml similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/hardcodedParameters.yaml rename to Tests/test_data/unit/library_data/hardcodedParameters.yaml diff --git a/Tensile/Tests/test_data/unit/library_data/initialSolutionParameters.yaml b/Tests/test_data/unit/library_data/initialSolutionParameters.yaml similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/initialSolutionParameters.yaml rename to Tests/test_data/unit/library_data/initialSolutionParameters.yaml diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco 
b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco rename to 
Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary.yaml b/Tests/test_data/unit/library_data/library/TensileLibrary.yaml similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary.yaml rename to Tests/test_data/unit/library_data/library/TensileLibrary.yaml diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co similarity index 100% rename from 
Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co diff --git a/Tensile/Tests/test_data/unit/library_data/library/metadata.yaml b/Tests/test_data/unit/library_data/library/metadata.yaml similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/library/metadata.yaml rename to Tests/test_data/unit/library_data/library/metadata.yaml diff --git a/Tensile/Tests/test_data/unit/library_data/problemType.yaml b/Tests/test_data/unit/library_data/problemType.yaml similarity index 100% rename from Tensile/Tests/test_data/unit/library_data/problemType.yaml rename to Tests/test_data/unit/library_data/problemType.yaml diff --git a/Tensile/Tests/test_data/unit/solutions/solutions_nn_3.yaml b/Tests/test_data/unit/solutions/solutions_nn_3.yaml similarity index 100% rename from Tensile/Tests/test_data/unit/solutions/solutions_nn_3.yaml rename to Tests/test_data/unit/solutions/solutions_nn_3.yaml diff --git a/Tensile/Tests/unit/__init__.py b/Tests/unit/__init__.py similarity index 100% rename from Tensile/Tests/unit/__init__.py rename to Tests/unit/__init__.py diff --git a/Tensile/Tests/unit/customKernels/TestKernel.s b/Tests/unit/customKernels/TestKernel.s similarity index 100% rename from Tensile/Tests/unit/customKernels/TestKernel.s rename to Tests/unit/customKernels/TestKernel.s diff --git a/Tensile/Tests/unit/replacement/bad_file/bad.txt b/Tests/unit/replacement/bad_file/bad.txt similarity index 100% rename from Tensile/Tests/unit/replacement/bad_file/bad.txt rename to Tests/unit/replacement/bad_file/bad.txt diff --git 
a/Tensile/Tests/unit/replacement/duplicate_kernel/a.txt b/Tests/unit/replacement/duplicate_kernel/a.txt similarity index 100% rename from Tensile/Tests/unit/replacement/duplicate_kernel/a.txt rename to Tests/unit/replacement/duplicate_kernel/a.txt diff --git a/Tensile/Tests/unit/replacement/duplicate_kernel/b.txt b/Tests/unit/replacement/duplicate_kernel/b.txt similarity index 100% rename from Tensile/Tests/unit/replacement/duplicate_kernel/b.txt rename to Tests/unit/replacement/duplicate_kernel/b.txt diff --git a/Tensile/Tests/unit/replacement/known_kernels_v2/baz.s.txt b/Tests/unit/replacement/known_kernels_v2/baz.s.txt similarity index 100% rename from Tensile/Tests/unit/replacement/known_kernels_v2/baz.s.txt rename to Tests/unit/replacement/known_kernels_v2/baz.s.txt diff --git a/Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt b/Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt similarity index 100% rename from Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt rename to Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt diff --git a/Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt b/Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt similarity index 100% rename from Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt rename to Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt diff --git a/Tensile/Tests/unit/replacement/known_kernels_v3/baz.s.txt b/Tests/unit/replacement/known_kernels_v3/baz.s.txt similarity index 100% rename from Tensile/Tests/unit/replacement/known_kernels_v3/baz.s.txt rename to Tests/unit/replacement/known_kernels_v3/baz.s.txt diff --git a/Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt b/Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt similarity index 100% rename from Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt rename to 
Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt diff --git a/Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt b/Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt similarity index 100% rename from Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt rename to Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt diff --git a/Tensile/Tests/unit/test_Common.py b/Tests/unit/test_Common.py similarity index 100% rename from Tensile/Tests/unit/test_Common.py rename to Tests/unit/test_Common.py diff --git a/Tensile/Tests/unit/test_Component.py b/Tests/unit/test_Component.py similarity index 100% rename from Tensile/Tests/unit/test_Component.py rename to Tests/unit/test_Component.py diff --git a/Tensile/Tests/unit/test_Configuration.py b/Tests/unit/test_Configuration.py similarity index 100% rename from Tensile/Tests/unit/test_Configuration.py rename to Tests/unit/test_Configuration.py diff --git a/Tensile/Tests/unit/test_CustomKernels.py b/Tests/unit/test_CustomKernels.py similarity index 100% rename from Tensile/Tests/unit/test_CustomKernels.py rename to Tests/unit/test_CustomKernels.py diff --git a/Tensile/Tests/unit/test_DataType.py b/Tests/unit/test_DataType.py similarity index 100% rename from Tensile/Tests/unit/test_DataType.py rename to Tests/unit/test_DataType.py diff --git a/Tensile/Tests/unit/test_HardwarePredicates.py b/Tests/unit/test_HardwarePredicates.py similarity index 100% rename from Tensile/Tests/unit/test_HardwarePredicates.py rename to Tests/unit/test_HardwarePredicates.py diff --git a/Tensile/Tests/unit/test_KernelWriterAssembly.py b/Tests/unit/test_KernelWriterAssembly.py similarity index 100% rename from Tensile/Tests/unit/test_KernelWriterAssembly.py rename to Tests/unit/test_KernelWriterAssembly.py diff --git a/Tensile/Tests/unit/test_LibraryIO.py b/Tests/unit/test_LibraryIO.py similarity index 100% rename from Tensile/Tests/unit/test_LibraryIO.py rename to 
Tests/unit/test_LibraryIO.py diff --git a/Tensile/Tests/unit/test_PerfMetricPredicates.py b/Tests/unit/test_PerfMetricPredicates.py similarity index 100% rename from Tensile/Tests/unit/test_PerfMetricPredicates.py rename to Tests/unit/test_PerfMetricPredicates.py diff --git a/Tensile/Tests/unit/test_Priority.py b/Tests/unit/test_Priority.py similarity index 100% rename from Tensile/Tests/unit/test_Priority.py rename to Tests/unit/test_Priority.py diff --git a/Tensile/Tests/unit/test_ReplacementKernels.py b/Tests/unit/test_ReplacementKernels.py similarity index 100% rename from Tensile/Tests/unit/test_ReplacementKernels.py rename to Tests/unit/test_ReplacementKernels.py diff --git a/Tensile/Tests/unit/test_TensileCreateLibrary.py b/Tests/unit/test_TensileCreateLibrary.py similarity index 100% rename from Tensile/Tests/unit/test_TensileCreateLibrary.py rename to Tests/unit/test_TensileCreateLibrary.py diff --git a/Tensile/Tests/unit/test_conv_problem.py b/Tests/unit/test_conv_problem.py similarity index 100% rename from Tensile/Tests/unit/test_conv_problem.py rename to Tests/unit/test_conv_problem.py diff --git a/Tensile/Tests/unit/test_exact_problem.py b/Tests/unit/test_exact_problem.py similarity index 100% rename from Tensile/Tests/unit/test_exact_problem.py rename to Tests/unit/test_exact_problem.py diff --git a/Tensile/Tests/unit/test_makeProblem.py b/Tests/unit/test_makeProblem.py similarity index 100% rename from Tensile/Tests/unit/test_makeProblem.py rename to Tests/unit/test_makeProblem.py diff --git a/Tensile/Tests/unit/test_mergeLogic.py b/Tests/unit/test_mergeLogic.py similarity index 100% rename from Tensile/Tests/unit/test_mergeLogic.py rename to Tests/unit/test_mergeLogic.py diff --git a/Tensile/Tests/unit/test_tryAssembler.py b/Tests/unit/test_tryAssembler.py similarity index 100% rename from Tensile/Tests/unit/test_tryAssembler.py rename to Tests/unit/test_tryAssembler.py diff --git a/Tensile/Tests/unit/test_useGlobalParameters.py 
b/Tests/unit/test_useGlobalParameters.py similarity index 100% rename from Tensile/Tests/unit/test_useGlobalParameters.py rename to Tests/unit/test_useGlobalParameters.py diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_nn.yaml b/Tests/vega_20/fast/igemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/vega_20/fast/igemm_asm_nn.yaml rename to Tests/vega_20/fast/igemm_asm_nn.yaml diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_nt.yaml b/Tests/vega_20/fast/igemm_asm_nt.yaml similarity index 100% rename from Tensile/Tests/vega_20/fast/igemm_asm_nt.yaml rename to Tests/vega_20/fast/igemm_asm_nt.yaml diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_tn.yaml b/Tests/vega_20/fast/igemm_asm_tn.yaml similarity index 100% rename from Tensile/Tests/vega_20/fast/igemm_asm_tn.yaml rename to Tests/vega_20/fast/igemm_asm_tn.yaml diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_tt.yaml b/Tests/vega_20/fast/igemm_asm_tt.yaml similarity index 100% rename from Tensile/Tests/vega_20/fast/igemm_asm_tt.yaml rename to Tests/vega_20/fast/igemm_asm_tt.yaml diff --git a/Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml b/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml similarity index 100% rename from Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml rename to Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml diff --git a/Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml b/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml similarity index 100% rename from Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml rename to Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml diff --git a/Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml b/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml similarity index 100% rename from Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml rename to Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml diff --git 
a/Tensile/Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml b/Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml similarity index 100% rename from Tensile/Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml rename to Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml diff --git a/Tensile/Tests/weekly/assertions/README b/Tests/weekly/assertions/README similarity index 100% rename from Tensile/Tests/weekly/assertions/README rename to Tests/weekly/assertions/README diff --git a/Tensile/Tests/weekly/assertions/test_hgemm_asem2_asm.yaml b/Tests/weekly/assertions/test_hgemm_asem2_asm.yaml similarity index 100% rename from Tensile/Tests/weekly/assertions/test_hgemm_asem2_asm.yaml rename to Tests/weekly/assertions/test_hgemm_asem2_asm.yaml diff --git a/Tensile/Tests/weekly/classic_source/test_hgemm_vectors.yaml b/Tests/weekly/classic_source/test_hgemm_vectors.yaml similarity index 100% rename from Tensile/Tests/weekly/classic_source/test_hgemm_vectors.yaml rename to Tests/weekly/classic_source/test_hgemm_vectors.yaml diff --git a/Tensile/Tests/weekly/classic_source/test_sgemm_vectors.yaml b/Tests/weekly/classic_source/test_sgemm_vectors.yaml similarity index 100% rename from Tensile/Tests/weekly/classic_source/test_sgemm_vectors.yaml rename to Tests/weekly/classic_source/test_sgemm_vectors.yaml diff --git a/Tensile/Tests/yaml_only/test_config.py b/Tests/yaml_only/test_config.py similarity index 100% rename from Tensile/Tests/yaml_only/test_config.py rename to Tests/yaml_only/test_config.py diff --git a/Tensile/Tests/yaml_only/test_ya b/Tests/yaml_only/test_ya similarity index 100% rename from Tensile/Tests/yaml_only/test_ya rename to Tests/yaml_only/test_ya diff --git a/Tensile/AsmMemoryInstruction.py b/src/Tensile/AsmMemoryInstruction.py similarity index 100% rename from Tensile/AsmMemoryInstruction.py rename to src/Tensile/AsmMemoryInstruction.py diff --git a/Tensile/AsmRegisterPool.py b/src/Tensile/AsmRegisterPool.py similarity index 100% rename from 
Tensile/AsmRegisterPool.py rename to src/Tensile/AsmRegisterPool.py diff --git a/Tensile/AsmUtils.py b/src/Tensile/AsmUtils.py similarity index 100% rename from Tensile/AsmUtils.py rename to src/Tensile/AsmUtils.py diff --git a/Tensile/BenchmarkProblems.py b/src/Tensile/BenchmarkProblems.py similarity index 100% rename from Tensile/BenchmarkProblems.py rename to src/Tensile/BenchmarkProblems.py diff --git a/Tensile/BenchmarkSplitter.py b/src/Tensile/BenchmarkSplitter.py similarity index 100% rename from Tensile/BenchmarkSplitter.py rename to src/Tensile/BenchmarkSplitter.py diff --git a/Tensile/BenchmarkStructs.py b/src/Tensile/BenchmarkStructs.py similarity index 100% rename from Tensile/BenchmarkStructs.py rename to src/Tensile/BenchmarkStructs.py diff --git a/Tensile/ClientExecutable.py b/src/Tensile/ClientExecutable.py similarity index 100% rename from Tensile/ClientExecutable.py rename to src/Tensile/ClientExecutable.py diff --git a/Tensile/ClientWriter.py b/src/Tensile/ClientWriter.py similarity index 100% rename from Tensile/ClientWriter.py rename to src/Tensile/ClientWriter.py diff --git a/Tensile/Code.py b/src/Tensile/Code.py similarity index 100% rename from Tensile/Code.py rename to src/Tensile/Code.py diff --git a/Tensile/Common.py b/src/Tensile/Common.py similarity index 100% rename from Tensile/Common.py rename to src/Tensile/Common.py diff --git a/Tensile/Component.py b/src/Tensile/Component.py similarity index 100% rename from Tensile/Component.py rename to src/Tensile/Component.py diff --git a/Tensile/Components/ComputeStoreVgprs.py b/src/Tensile/Components/ComputeStoreVgprs.py similarity index 100% rename from Tensile/Components/ComputeStoreVgprs.py rename to src/Tensile/Components/ComputeStoreVgprs.py diff --git a/Tensile/Components/LocalRead.py b/src/Tensile/Components/LocalRead.py similarity index 100% rename from Tensile/Components/LocalRead.py rename to src/Tensile/Components/LocalRead.py diff --git a/Tensile/Components/LraTileAssignment.py 
b/src/Tensile/Components/LraTileAssignment.py similarity index 100% rename from Tensile/Components/LraTileAssignment.py rename to src/Tensile/Components/LraTileAssignment.py diff --git a/Tensile/Components/MAC_BF16_HPA.py b/src/Tensile/Components/MAC_BF16_HPA.py similarity index 100% rename from Tensile/Components/MAC_BF16_HPA.py rename to src/Tensile/Components/MAC_BF16_HPA.py diff --git a/Tensile/Components/MAC_F16.py b/src/Tensile/Components/MAC_F16.py similarity index 100% rename from Tensile/Components/MAC_F16.py rename to src/Tensile/Components/MAC_F16.py diff --git a/Tensile/Components/MAC_F16_HPA.py b/src/Tensile/Components/MAC_F16_HPA.py similarity index 100% rename from Tensile/Components/MAC_F16_HPA.py rename to src/Tensile/Components/MAC_F16_HPA.py diff --git a/Tensile/Components/MAC_F32.py b/src/Tensile/Components/MAC_F32.py similarity index 100% rename from Tensile/Components/MAC_F32.py rename to src/Tensile/Components/MAC_F32.py diff --git a/Tensile/Components/MAC_F32C.py b/src/Tensile/Components/MAC_F32C.py similarity index 100% rename from Tensile/Components/MAC_F32C.py rename to src/Tensile/Components/MAC_F32C.py diff --git a/Tensile/Components/MAC_F64.py b/src/Tensile/Components/MAC_F64.py similarity index 100% rename from Tensile/Components/MAC_F64.py rename to src/Tensile/Components/MAC_F64.py diff --git a/Tensile/Components/MAC_F64C.py b/src/Tensile/Components/MAC_F64C.py similarity index 100% rename from Tensile/Components/MAC_F64C.py rename to src/Tensile/Components/MAC_F64C.py diff --git a/Tensile/Components/MAC_I8X4.py b/src/Tensile/Components/MAC_I8X4.py similarity index 100% rename from Tensile/Components/MAC_I8X4.py rename to src/Tensile/Components/MAC_I8X4.py diff --git a/Tensile/Components/MAC_I8_HPA.py b/src/Tensile/Components/MAC_I8_HPA.py similarity index 100% rename from Tensile/Components/MAC_I8_HPA.py rename to src/Tensile/Components/MAC_I8_HPA.py diff --git a/Tensile/Components/MFMA.py b/src/Tensile/Components/MFMA.py 
similarity index 100% rename from Tensile/Components/MFMA.py rename to src/Tensile/Components/MFMA.py diff --git a/Tensile/Components/NotLocalFullTileElements.py b/src/Tensile/Components/NotLocalFullTileElements.py similarity index 100% rename from Tensile/Components/NotLocalFullTileElements.py rename to src/Tensile/Components/NotLocalFullTileElements.py diff --git a/Tensile/Components/Priority.py b/src/Tensile/Components/Priority.py similarity index 100% rename from Tensile/Components/Priority.py rename to src/Tensile/Components/Priority.py diff --git a/Tensile/Components/PseudoRandomGenerator.py b/src/Tensile/Components/PseudoRandomGenerator.py similarity index 100% rename from Tensile/Components/PseudoRandomGenerator.py rename to src/Tensile/Components/PseudoRandomGenerator.py diff --git a/Tensile/Components/ShiftVectorComponents.py b/src/Tensile/Components/ShiftVectorComponents.py similarity index 100% rename from Tensile/Components/ShiftVectorComponents.py rename to src/Tensile/Components/ShiftVectorComponents.py diff --git a/Tensile/Components/Signature.py b/src/Tensile/Components/Signature.py similarity index 100% rename from Tensile/Components/Signature.py rename to src/Tensile/Components/Signature.py diff --git a/Tensile/Components/__init__.py b/src/Tensile/Components/__init__.py similarity index 100% rename from Tensile/Components/__init__.py rename to src/Tensile/Components/__init__.py diff --git a/Tensile/Configuration.py b/src/Tensile/Configuration.py similarity index 100% rename from Tensile/Configuration.py rename to src/Tensile/Configuration.py diff --git a/Tensile/Contractions.py b/src/Tensile/Contractions.py similarity index 100% rename from Tensile/Contractions.py rename to src/Tensile/Contractions.py diff --git a/Tensile/CustomKernels.py b/src/Tensile/CustomKernels.py similarity index 100% rename from Tensile/CustomKernels.py rename to src/Tensile/CustomKernels.py diff --git 
a/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s b/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s similarity index 100% rename from Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s rename to src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s diff --git a/Tensile/DataType.py b/src/Tensile/DataType.py similarity index 100% rename from Tensile/DataType.py rename to src/Tensile/DataType.py diff --git a/Tensile/EmbeddedData.py b/src/Tensile/EmbeddedData.py similarity index 100% rename from Tensile/EmbeddedData.py rename to src/Tensile/EmbeddedData.py diff --git a/Tensile/GenerateSummations.py b/src/Tensile/GenerateSummations.py similarity index 100% rename from Tensile/GenerateSummations.py rename to src/Tensile/GenerateSummations.py diff --git a/Tensile/Hardware.py b/src/Tensile/Hardware.py similarity index 100% rename from Tensile/Hardware.py rename to src/Tensile/Hardware.py diff --git a/Tensile/KernelWriter.py b/src/Tensile/KernelWriter.py similarity index 100% rename from Tensile/KernelWriter.py rename to src/Tensile/KernelWriter.py diff --git a/Tensile/KernelWriterAssembly.py b/src/Tensile/KernelWriterAssembly.py similarity index 100% rename from Tensile/KernelWriterAssembly.py rename to src/Tensile/KernelWriterAssembly.py diff --git a/Tensile/KernelWriterBase.py b/src/Tensile/KernelWriterBase.py similarity index 100% rename from Tensile/KernelWriterBase.py rename to src/Tensile/KernelWriterBase.py diff --git a/Tensile/KernelWriterBetaOnly.py b/src/Tensile/KernelWriterBetaOnly.py similarity index 100% rename from Tensile/KernelWriterBetaOnly.py rename to src/Tensile/KernelWriterBetaOnly.py diff --git a/Tensile/KernelWriterConversion.py b/src/Tensile/KernelWriterConversion.py similarity index 100% rename from Tensile/KernelWriterConversion.py rename to src/Tensile/KernelWriterConversion.py diff 
--git a/Tensile/KernelWriterSource.py b/src/Tensile/KernelWriterSource.py similarity index 100% rename from Tensile/KernelWriterSource.py rename to src/Tensile/KernelWriterSource.py diff --git a/Tensile/KernelWriterStreamKInit.py b/src/Tensile/KernelWriterStreamKInit.py similarity index 100% rename from Tensile/KernelWriterStreamKInit.py rename to src/Tensile/KernelWriterStreamKInit.py diff --git a/Tensile/LibraryIO.py b/src/Tensile/LibraryIO.py similarity index 100% rename from Tensile/LibraryIO.py rename to src/Tensile/LibraryIO.py diff --git a/Tensile/LibraryLogic.py b/src/Tensile/LibraryLogic.py similarity index 100% rename from Tensile/LibraryLogic.py rename to src/Tensile/LibraryLogic.py diff --git a/Tensile/Parallel.py b/src/Tensile/Parallel.py similarity index 100% rename from Tensile/Parallel.py rename to src/Tensile/Parallel.py diff --git a/Tensile/Properties.py b/src/Tensile/Properties.py similarity index 100% rename from Tensile/Properties.py rename to src/Tensile/Properties.py diff --git a/Tensile/ReplacementKernels.py b/src/Tensile/ReplacementKernels.py similarity index 100% rename from Tensile/ReplacementKernels.py rename to src/Tensile/ReplacementKernels.py diff --git a/Tensile/SolutionLibrary.py b/src/Tensile/SolutionLibrary.py similarity index 100% rename from Tensile/SolutionLibrary.py rename to src/Tensile/SolutionLibrary.py diff --git a/Tensile/SolutionSelectionLibrary.py b/src/Tensile/SolutionSelectionLibrary.py similarity index 100% rename from Tensile/SolutionSelectionLibrary.py rename to src/Tensile/SolutionSelectionLibrary.py diff --git a/Tensile/SolutionStructs.py b/src/Tensile/SolutionStructs.py similarity index 100% rename from Tensile/SolutionStructs.py rename to src/Tensile/SolutionStructs.py diff --git a/Tensile/SolutionWriter.py b/src/Tensile/SolutionWriter.py similarity index 100% rename from Tensile/SolutionWriter.py rename to src/Tensile/SolutionWriter.py diff --git a/Tensile/Tensile.py b/src/Tensile/Tensile.py similarity index 
100% rename from Tensile/Tensile.py rename to src/Tensile/Tensile.py diff --git a/Tensile/TensileBenchmarkCluster.py b/src/Tensile/TensileBenchmarkCluster.py similarity index 100% rename from Tensile/TensileBenchmarkCluster.py rename to src/Tensile/TensileBenchmarkCluster.py diff --git a/Tensile/TensileBenchmarkClusterScripts.py b/src/Tensile/TensileBenchmarkClusterScripts.py similarity index 100% rename from Tensile/TensileBenchmarkClusterScripts.py rename to src/Tensile/TensileBenchmarkClusterScripts.py diff --git a/Tensile/TensileBenchmarkLibraryClient.py b/src/Tensile/TensileBenchmarkLibraryClient.py similarity index 100% rename from Tensile/TensileBenchmarkLibraryClient.py rename to src/Tensile/TensileBenchmarkLibraryClient.py diff --git a/Tensile/TensileClientConfig.py b/src/Tensile/TensileClientConfig.py similarity index 100% rename from Tensile/TensileClientConfig.py rename to src/Tensile/TensileClientConfig.py diff --git a/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py similarity index 100% rename from Tensile/TensileCreateLibrary.py rename to src/Tensile/TensileCreateLibrary.py diff --git a/Tensile/TensileLibLogicToYaml.py b/src/Tensile/TensileLibLogicToYaml.py similarity index 100% rename from Tensile/TensileLibLogicToYaml.py rename to src/Tensile/TensileLibLogicToYaml.py diff --git a/Tensile/TensileMergeLibrary.py b/src/Tensile/TensileMergeLibrary.py similarity index 100% rename from Tensile/TensileMergeLibrary.py rename to src/Tensile/TensileMergeLibrary.py diff --git a/Tensile/TensileRetuneLibrary.py b/src/Tensile/TensileRetuneLibrary.py similarity index 100% rename from Tensile/TensileRetuneLibrary.py rename to src/Tensile/TensileRetuneLibrary.py diff --git a/Tensile/TensileUpdateLibrary.py b/src/Tensile/TensileUpdateLibrary.py similarity index 100% rename from Tensile/TensileUpdateLibrary.py rename to src/Tensile/TensileUpdateLibrary.py diff --git a/Tensile/Utils.py b/src/Tensile/Utils.py similarity index 100% rename from 
Tensile/Utils.py rename to src/Tensile/Utils.py diff --git a/Tensile/__init__.py b/src/Tensile/__init__.py similarity index 100% rename from Tensile/__init__.py rename to src/Tensile/__init__.py diff --git a/Tensile/bin/Tensile b/src/Tensile/bin/Tensile similarity index 100% rename from Tensile/bin/Tensile rename to src/Tensile/bin/Tensile diff --git a/Tensile/bin/TensileBenchmarkCluster b/src/Tensile/bin/TensileBenchmarkCluster similarity index 100% rename from Tensile/bin/TensileBenchmarkCluster rename to src/Tensile/bin/TensileBenchmarkCluster diff --git a/Tensile/bin/TensileClientConfig b/src/Tensile/bin/TensileClientConfig similarity index 100% rename from Tensile/bin/TensileClientConfig rename to src/Tensile/bin/TensileClientConfig diff --git a/Tensile/bin/TensileCreateLibrary b/src/Tensile/bin/TensileCreateLibrary similarity index 100% rename from Tensile/bin/TensileCreateLibrary rename to src/Tensile/bin/TensileCreateLibrary diff --git a/Tensile/bin/TensileGenerateSummations b/src/Tensile/bin/TensileGenerateSummations similarity index 100% rename from Tensile/bin/TensileGenerateSummations rename to src/Tensile/bin/TensileGenerateSummations diff --git a/Tensile/bin/TensileLibLogicToYaml b/src/Tensile/bin/TensileLibLogicToYaml similarity index 100% rename from Tensile/bin/TensileLibLogicToYaml rename to src/Tensile/bin/TensileLibLogicToYaml diff --git a/Tensile/bin/TensileMergeLibrary b/src/Tensile/bin/TensileMergeLibrary similarity index 100% rename from Tensile/bin/TensileMergeLibrary rename to src/Tensile/bin/TensileMergeLibrary diff --git a/Tensile/bin/TensileRetuneLibrary b/src/Tensile/bin/TensileRetuneLibrary similarity index 100% rename from Tensile/bin/TensileRetuneLibrary rename to src/Tensile/bin/TensileRetuneLibrary diff --git a/Tensile/bin/TensileUpdateLibrary b/src/Tensile/bin/TensileUpdateLibrary similarity index 100% rename from Tensile/bin/TensileUpdateLibrary rename to src/Tensile/bin/TensileUpdateLibrary diff --git 
a/Tensile/Configs/alternate-format/sizeList-example.yaml b/src/Tensile/data/Configs/alternate-format/sizeList-example.yaml similarity index 100% rename from Tensile/Configs/alternate-format/sizeList-example.yaml rename to src/Tensile/data/Configs/alternate-format/sizeList-example.yaml diff --git a/Tensile/Configs/alternate-format/vega20-example.yaml b/src/Tensile/data/Configs/alternate-format/vega20-example.yaml similarity index 100% rename from Tensile/Configs/alternate-format/vega20-example.yaml rename to src/Tensile/data/Configs/alternate-format/vega20-example.yaml diff --git a/Tensile/Configs/deep_bench_nn.csv b/src/Tensile/data/Configs/deep_bench_nn.csv similarity index 100% rename from Tensile/Configs/deep_bench_nn.csv rename to src/Tensile/data/Configs/deep_bench_nn.csv diff --git a/Tensile/Configs/deep_bench_nn_batched.csv b/src/Tensile/data/Configs/deep_bench_nn_batched.csv similarity index 100% rename from Tensile/Configs/deep_bench_nn_batched.csv rename to src/Tensile/data/Configs/deep_bench_nn_batched.csv diff --git a/Tensile/Configs/deep_bench_nt.csv b/src/Tensile/data/Configs/deep_bench_nt.csv similarity index 100% rename from Tensile/Configs/deep_bench_nt.csv rename to src/Tensile/data/Configs/deep_bench_nt.csv diff --git a/Tensile/Configs/deep_bench_nt_batched.csv b/src/Tensile/data/Configs/deep_bench_nt_batched.csv similarity index 100% rename from Tensile/Configs/deep_bench_nt_batched.csv rename to src/Tensile/data/Configs/deep_bench_nt_batched.csv diff --git a/Tensile/Configs/deep_bench_tn.csv b/src/Tensile/data/Configs/deep_bench_tn.csv similarity index 100% rename from Tensile/Configs/deep_bench_tn.csv rename to src/Tensile/data/Configs/deep_bench_tn.csv diff --git a/Tensile/Configs/deep_bench_tn_batched.csv b/src/Tensile/data/Configs/deep_bench_tn_batched.csv similarity index 100% rename from Tensile/Configs/deep_bench_tn_batched.csv rename to src/Tensile/data/Configs/deep_bench_tn_batched.csv diff --git 
a/Tensile/Configs/mfma/mfma_hpa_bf16_nt_test.yaml b/src/Tensile/data/Configs/mfma/mfma_hpa_bf16_nt_test.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_hpa_bf16_nt_test.yaml rename to src/Tensile/data/Configs/mfma/mfma_hpa_bf16_nt_test.yaml diff --git a/Tensile/Configs/mfma/mfma_igemm_lite_test.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_lite_test.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_igemm_lite_test.yaml rename to src/Tensile/data/Configs/mfma/mfma_igemm_lite_test.yaml diff --git a/Tensile/Configs/mfma/mfma_igemm_nn_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_igemm_nn_asm_full.yaml rename to src/Tensile/data/Configs/mfma/mfma_igemm_nn_asm_full.yaml diff --git a/Tensile/Configs/mfma/mfma_igemm_nt_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_igemm_nt_asm_full.yaml rename to src/Tensile/data/Configs/mfma/mfma_igemm_nt_asm_full.yaml diff --git a/Tensile/Configs/mfma/mfma_igemm_tn_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_igemm_tn_asm_full.yaml rename to src/Tensile/data/Configs/mfma/mfma_igemm_tn_asm_full.yaml diff --git a/Tensile/Configs/mfma/mfma_igemm_tt_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_igemm_tt_asm_full.yaml rename to src/Tensile/data/Configs/mfma/mfma_igemm_tt_asm_full.yaml diff --git a/Tensile/Configs/mfma/mfma_test.yaml b/src/Tensile/data/Configs/mfma/mfma_test.yaml similarity index 100% rename from Tensile/Configs/mfma/mfma_test.yaml rename to src/Tensile/data/Configs/mfma/mfma_test.yaml diff --git a/Tensile/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml b/src/Tensile/data/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml similarity index 100% rename from 
Tensile/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml rename to src/Tensile/data/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml diff --git a/Tensile/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml similarity index 100% rename from Tensile/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml rename to src/Tensile/data/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml diff --git a/Tensile/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml b/src/Tensile/data/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml similarity index 100% rename from Tensile/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml rename to src/Tensile/data/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml diff --git a/Tensile/Configs/mfma/sgemm_tlunn.yaml b/src/Tensile/data/Configs/mfma/sgemm_tlunn.yaml similarity index 100% rename from Tensile/Configs/mfma/sgemm_tlunn.yaml rename to src/Tensile/data/Configs/mfma/sgemm_tlunn.yaml diff --git a/Tensile/Configs/mfma/sgemm_transposeLDS.yaml b/src/Tensile/data/Configs/mfma/sgemm_transposeLDS.yaml similarity index 100% rename from Tensile/Configs/mfma/sgemm_transposeLDS.yaml rename to src/Tensile/data/Configs/mfma/sgemm_transposeLDS.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml diff --git 
a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml 
b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/Makefile b/src/Tensile/data/Configs/miopen/Makefile similarity index 100% rename from Tensile/Configs/miopen/Makefile rename to src/Tensile/data/Configs/miopen/Makefile diff --git a/Tensile/Configs/miopen/README.md b/src/Tensile/data/Configs/miopen/README.md similarity index 100% rename from Tensile/Configs/miopen/README.md rename to src/Tensile/data/Configs/miopen/README.md diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml diff --git 
a/Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml rename to 
src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git 
a/Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml rename to 
src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml 
b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml diff --git 
a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git 
a/Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml rename to 
src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml rename to 
src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git 
a/Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml rename to 
src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml 
b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml diff --git 
a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml similarity index 100% rename 
from Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git 
a/Tensile/Configs/miopen/archives/ext2/2020-11-05/README b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/README similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/README rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/README diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml diff --git 
a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh diff --git 
a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml rename 
to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml diff --git 
a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml rename to 
src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml diff --git 
a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml 
b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml 
b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml rename to 
src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml 
b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml similarity index 100% rename 
from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml diff --git 
a/Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml 
b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/README.md b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/README.md similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/README.md rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/README.md diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml 
similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml 
b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/README.md b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/README.md similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/README.md rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/README.md diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml rename to 
src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml rename 
to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml 
b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml diff --git 
a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml similarity index 100% 
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml rename to 
src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git 
a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml diff --git 
a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename 
from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml 
b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/logs/convert.log b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/convert.log similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/logs/convert.log rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/convert.log diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/logs/merge.log b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/merge.log similarity index 100% rename from Tensile/Configs/miopen/archives/rk/2020-08-12/logs/merge.log rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/merge.log diff --git a/Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% 
rename from Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml 
b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml diff --git 
a/Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml diff --git a/Tensile/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml 
b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml rename to 
src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml diff --git a/Tensile/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml rename to src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml diff --git a/Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml rename to src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml diff --git a/Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml 
b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml rename to 
src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml diff --git 
a/Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml 
b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from 
Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml rename to 
src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git 
a/Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml 
b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml diff --git a/Tensile/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml diff --git a/Tensile/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml rename to 
src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml diff --git a/Tensile/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename from Tensile/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Configs/miopen/boiler/header.yml b/src/Tensile/data/Configs/miopen/boiler/header.yml similarity index 100% rename from Tensile/Configs/miopen/boiler/header.yml rename to src/Tensile/data/Configs/miopen/boiler/header.yml diff --git a/Tensile/Configs/miopen/boiler/library_logic_hip_only.yml b/src/Tensile/data/Configs/miopen/boiler/library_logic_hip_only.yml similarity index 100% rename from Tensile/Configs/miopen/boiler/library_logic_hip_only.yml rename to src/Tensile/data/Configs/miopen/boiler/library_logic_hip_only.yml diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml b/src/Tensile/data/Configs/miopen/boiler/library_logic_vega10_only.yml similarity index 100% rename from Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml rename to src/Tensile/data/Configs/miopen/boiler/library_logic_vega10_only.yml diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml b/src/Tensile/data/Configs/miopen/boiler/library_logic_vega20_only.yml similarity index 100% rename from Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml rename to src/Tensile/data/Configs/miopen/boiler/library_logic_vega20_only.yml diff --git a/Tensile/Configs/miopen/convert_cfg.py b/src/Tensile/data/Configs/miopen/convert_cfg.py similarity index 100% rename from Tensile/Configs/miopen/convert_cfg.py rename to src/Tensile/data/Configs/miopen/convert_cfg.py diff --git a/Tensile/Configs/miopen/make_all.sh 
b/src/Tensile/data/Configs/miopen/make_all.sh similarity index 100% rename from Tensile/Configs/miopen/make_all.sh rename to src/Tensile/data/Configs/miopen/make_all.sh diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_gemm_large.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_large.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/deepbench_gemm_large.yml rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_large.yml diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml diff --git a/Tensile/Configs/miopen/problems/nn/resnet50_all.yml b/src/Tensile/data/Configs/miopen/problems/nn/resnet50_all.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/resnet50_all.yml rename to src/Tensile/data/Configs/miopen/problems/nn/resnet50_all.yml diff --git a/Tensile/Configs/miopen/problems/nn/resnet50_batch64.yml b/src/Tensile/data/Configs/miopen/problems/nn/resnet50_batch64.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/resnet50_batch64.yml rename 
to src/Tensile/data/Configs/miopen/problems/nn/resnet50_batch64.yml diff --git a/Tensile/Configs/miopen/problems/nn/resnet_batch64_B.yml b/src/Tensile/data/Configs/miopen/problems/nn/resnet_batch64_B.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nn/resnet_batch64_B.yml rename to src/Tensile/data/Configs/miopen/problems/nn/resnet_batch64_B.yml diff --git a/Tensile/Configs/miopen/problems/nt/deepbench_gemm_large.yml b/src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_large.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nt/deepbench_gemm_large.yml rename to src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_large.yml diff --git a/Tensile/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml b/src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml rename to src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml diff --git a/Tensile/Configs/miopen/problems/nt/resnet50_all.yml b/src/Tensile/data/Configs/miopen/problems/nt/resnet50_all.yml similarity index 100% rename from Tensile/Configs/miopen/problems/nt/resnet50_all.yml rename to src/Tensile/data/Configs/miopen/problems/nt/resnet50_all.yml diff --git a/Tensile/Configs/miopen/problems/tn/deepbench_gemm_large.yml b/src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_large.yml similarity index 100% rename from Tensile/Configs/miopen/problems/tn/deepbench_gemm_large.yml rename to src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_large.yml diff --git a/Tensile/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml b/src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml similarity index 100% rename from Tensile/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml rename to src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml diff --git a/Tensile/Configs/miopen/problems/tn/resnet50_all.yml 
b/src/Tensile/data/Configs/miopen/problems/tn/resnet50_all.yml similarity index 100% rename from Tensile/Configs/miopen/problems/tn/resnet50_all.yml rename to src/Tensile/data/Configs/miopen/problems/tn/resnet50_all.yml diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_3.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml rename to src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_3.yml diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_5.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml rename to src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_5.yml diff --git a/Tensile/Configs/miopen/solutions/hgemm_quick.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_quick.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/hgemm_quick.yml rename to src/Tensile/data/Configs/miopen/solutions/hgemm_quick.yml diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_3.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml rename to src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_3.yml diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_5.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml rename to src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_5.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_3.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml rename to 
src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_3.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_5.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_5.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_7.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_7.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_quick.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_quick.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_quick.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_quick.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_3.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_3.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_4.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_4.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_4.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_4.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_5.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_5.yml diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml 
b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_7.yml similarity index 100% rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_7.yml diff --git a/Tensile/Configs/miopen/types/hgemm_nn.yml b/src/Tensile/data/Configs/miopen/types/hgemm_nn.yml similarity index 100% rename from Tensile/Configs/miopen/types/hgemm_nn.yml rename to src/Tensile/data/Configs/miopen/types/hgemm_nn.yml diff --git a/Tensile/Configs/miopen/types/hgemm_nt.yml b/src/Tensile/data/Configs/miopen/types/hgemm_nt.yml similarity index 100% rename from Tensile/Configs/miopen/types/hgemm_nt.yml rename to src/Tensile/data/Configs/miopen/types/hgemm_nt.yml diff --git a/Tensile/Configs/miopen/types/hgemm_tn.yml b/src/Tensile/data/Configs/miopen/types/hgemm_tn.yml similarity index 100% rename from Tensile/Configs/miopen/types/hgemm_tn.yml rename to src/Tensile/data/Configs/miopen/types/hgemm_tn.yml diff --git a/Tensile/Configs/miopen/types/hgemm_tt.yml b/src/Tensile/data/Configs/miopen/types/hgemm_tt.yml similarity index 100% rename from Tensile/Configs/miopen/types/hgemm_tt.yml rename to src/Tensile/data/Configs/miopen/types/hgemm_tt.yml diff --git a/Tensile/Configs/miopen/types/igemm_nn.yml b/src/Tensile/data/Configs/miopen/types/igemm_nn.yml similarity index 100% rename from Tensile/Configs/miopen/types/igemm_nn.yml rename to src/Tensile/data/Configs/miopen/types/igemm_nn.yml diff --git a/Tensile/Configs/miopen/types/igemm_nt.yml b/src/Tensile/data/Configs/miopen/types/igemm_nt.yml similarity index 100% rename from Tensile/Configs/miopen/types/igemm_nt.yml rename to src/Tensile/data/Configs/miopen/types/igemm_nt.yml diff --git a/Tensile/Configs/miopen/types/igemm_tn.yml b/src/Tensile/data/Configs/miopen/types/igemm_tn.yml similarity index 100% rename from Tensile/Configs/miopen/types/igemm_tn.yml rename to src/Tensile/data/Configs/miopen/types/igemm_tn.yml diff --git 
a/Tensile/Configs/miopen/types/igemm_tt.yml b/src/Tensile/data/Configs/miopen/types/igemm_tt.yml similarity index 100% rename from Tensile/Configs/miopen/types/igemm_tt.yml rename to src/Tensile/data/Configs/miopen/types/igemm_tt.yml diff --git a/Tensile/Configs/miopen/types/sgemm_nn.yml b/src/Tensile/data/Configs/miopen/types/sgemm_nn.yml similarity index 100% rename from Tensile/Configs/miopen/types/sgemm_nn.yml rename to src/Tensile/data/Configs/miopen/types/sgemm_nn.yml diff --git a/Tensile/Configs/miopen/types/sgemm_nt.yml b/src/Tensile/data/Configs/miopen/types/sgemm_nt.yml similarity index 100% rename from Tensile/Configs/miopen/types/sgemm_nt.yml rename to src/Tensile/data/Configs/miopen/types/sgemm_nt.yml diff --git a/Tensile/Configs/miopen/types/sgemm_tn.yml b/src/Tensile/data/Configs/miopen/types/sgemm_tn.yml similarity index 100% rename from Tensile/Configs/miopen/types/sgemm_tn.yml rename to src/Tensile/data/Configs/miopen/types/sgemm_tn.yml diff --git a/Tensile/Configs/miopen/types/sgemm_tt.yml b/src/Tensile/data/Configs/miopen/types/sgemm_tt.yml similarity index 100% rename from Tensile/Configs/miopen/types/sgemm_tt.yml rename to src/Tensile/data/Configs/miopen/types/sgemm_tt.yml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml similarity index 100% rename from 
Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml diff --git 
a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml 
b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml similarity index 100% rename from 
Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/src/Tensile/data/Configs/rocblas_cgemm.yaml similarity index 100% rename from Tensile/Configs/rocblas_cgemm.yaml rename to src/Tensile/data/Configs/rocblas_cgemm.yaml diff --git a/Tensile/Configs/rocblas_cgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_cgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_cgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_cgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_cgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_cgemm_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_cgemm_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_cgemm_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_dgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_dgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_dgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_dgemm_asm_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_asm_single_kernel.yaml rename to 
src/Tensile/data/Configs/rocblas_dgemm_asm_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_dgemm_asm_square.yaml b/src/Tensile/data/Configs/rocblas_dgemm_asm_square.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_asm_square.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_asm_square.yaml diff --git a/Tensile/Configs/rocblas_dgemm_bufferload_limit.yaml b/src/Tensile/data/Configs/rocblas_dgemm_bufferload_limit.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_bufferload_limit.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_bufferload_limit.yaml diff --git a/Tensile/Configs/rocblas_dgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_dgemm_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nn_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nn_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nt_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml diff --git 
a/Tensile/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_nt_resume_train_exp.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_resume_train_exp.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_nt_resume_train_exp.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_nt_resume_train_exp.yaml diff --git a/Tensile/Configs/rocblas_dgemm_tn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_tn_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_tn_asm_full.yaml diff --git a/Tensile/Configs/rocblas_dgemm_tt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_dgemm_tt_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_dgemm_tt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hgemm_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hgemm_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hgemm_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hgemm_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hgemm_asm_lite.yaml 
b/src/Tensile/data/Configs/rocblas_hgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_hgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_hgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hgemm_asm_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_hgemm_asm_single_kernel.yaml rename to src/Tensile/data/Configs/rocblas_hgemm_asm_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_hgemm_bufferload_limit.yaml b/src/Tensile/data/Configs/rocblas_hgemm_bufferload_limit.yaml similarity index 100% rename from Tensile/Configs/rocblas_hgemm_bufferload_limit.yaml rename to src/Tensile/data/Configs/rocblas_hgemm_bufferload_limit.yaml diff --git a/Tensile/Configs/rocblas_hgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hgemm_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hgemm_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_hgemm_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml b/src/Tensile/data/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml diff --git a/Tensile/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml b/src/Tensile/data/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml 
b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml similarity index 100% 
rename from Tensile/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml rename 
to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_inc1_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_inc1_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_inc1_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml similarity index 100% rename from 
Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_lite.yaml similarity 
index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml 
b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml diff --git a/Tensile/Configs/rocblas_hpa_igemm_nn_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_nn_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_igemm_nn_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_igemm_nn_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_igemm_nt_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_nt_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_igemm_nt_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_igemm_nt_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_igemm_tn_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_tn_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_igemm_tn_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_igemm_tn_hip.yaml diff --git a/Tensile/Configs/rocblas_hpa_igemm_tt_hip.yaml 
b/src/Tensile/data/Configs/rocblas_hpa_igemm_tt_hip.yaml similarity index 100% rename from Tensile/Configs/rocblas_hpa_igemm_tt_hip.yaml rename to src/Tensile/data/Configs/rocblas_hpa_igemm_tt_hip.yaml diff --git a/Tensile/Configs/rocblas_hsgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hsgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_hsgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_hsgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_igemm_asm_full_nn.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_nn.yaml similarity index 100% rename from Tensile/Configs/rocblas_igemm_asm_full_nn.yaml rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_nn.yaml diff --git a/Tensile/Configs/rocblas_igemm_asm_full_nt.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_nt.yaml similarity index 100% rename from Tensile/Configs/rocblas_igemm_asm_full_nt.yaml rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_nt.yaml diff --git a/Tensile/Configs/rocblas_igemm_asm_full_tn.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_tn.yaml similarity index 100% rename from Tensile/Configs/rocblas_igemm_asm_full_tn.yaml rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_tn.yaml diff --git a/Tensile/Configs/rocblas_igemm_asm_full_tt.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_tt.yaml similarity index 100% rename from Tensile/Configs/rocblas_igemm_asm_full_tt.yaml rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_tt.yaml diff --git a/Tensile/Configs/rocblas_igemm_hip_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_igemm_hip_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_igemm_hip_single_kernel.yaml rename to src/Tensile/data/Configs/rocblas_igemm_hip_single_kernel.yaml diff --git a/Tensile/Configs/rocblas_sgemm_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_asm_full.yaml 
rename to src/Tensile/data/Configs/rocblas_sgemm_asm_full.yaml diff --git a/Tensile/Configs/rocblas_sgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_asm_lite.yaml diff --git a/Tensile/Configs/rocblas_sgemm_asm_only.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_only.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_asm_only.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_asm_only.yaml diff --git a/Tensile/Configs/rocblas_sgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_single_kernel.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_asm_single_kernel.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_asm_single_kernel.yaml diff --git a/Tensile/Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml b/src/Tensile/data/Configs/rocblas_sgemm_bufferload_limit.yaml similarity index 100% rename from Tensile/Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_bufferload_limit.yaml diff --git a/Tensile/Configs/rocblas_sgemm_example.yaml b/src/Tensile/data/Configs/rocblas_sgemm_example.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_example.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_example.yaml diff --git a/Tensile/Configs/rocblas_sgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_sgemm_hip_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_hip_lite.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_hip_lite.yaml diff --git a/Tensile/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml diff --git 
a/Tensile/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml diff --git a/Tensile/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml diff --git a/Tensile/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml similarity index 100% rename from Tensile/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml rename to src/Tensile/data/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/src/Tensile/data/Configs/rocblas_zgemm.yaml similarity index 100% rename from Tensile/Configs/rocblas_zgemm.yaml rename to src/Tensile/data/Configs/rocblas_zgemm.yaml diff --git a/Tensile/Configs/rocblas_zgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_zgemm_asm_lite.yaml similarity index 100% rename from Tensile/Configs/rocblas_zgemm_asm_lite.yaml rename to src/Tensile/data/Configs/rocblas_zgemm_asm_lite.yaml diff --git a/Tensile/Perf/BDAS/dgemm_kmeans.yaml b/src/Tensile/data/Perf/BDAS/dgemm_kmeans.yaml similarity index 100% rename from Tensile/Perf/BDAS/dgemm_kmeans.yaml rename to src/Tensile/data/Perf/BDAS/dgemm_kmeans.yaml diff --git a/Tensile/Perf/BDAS/dgemm_pca.yaml b/src/Tensile/data/Perf/BDAS/dgemm_pca.yaml similarity index 100% rename from Tensile/Perf/BDAS/dgemm_pca.yaml rename to 
src/Tensile/data/Perf/BDAS/dgemm_pca.yaml diff --git a/Tensile/Perf/BERT/sgemm_xdlops.yaml b/src/Tensile/data/Perf/BERT/sgemm_xdlops.yaml similarity index 100% rename from Tensile/Perf/BERT/sgemm_xdlops.yaml rename to src/Tensile/data/Perf/BERT/sgemm_xdlops.yaml diff --git a/Tensile/Perf/DLRM/sgemm_xdlops.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops.yaml similarity index 100% rename from Tensile/Perf/DLRM/sgemm_xdlops.yaml rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops.yaml diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nn.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn.yaml similarity index 100% rename from Tensile/Perf/DLRM/sgemm_xdlops_nn.yaml rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn.yaml diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml similarity index 100% rename from Tensile/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nt.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt.yaml similarity index 100% rename from Tensile/Perf/DLRM/sgemm_xdlops_nt.yaml rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt.yaml diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml similarity index 100% rename from Tensile/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml similarity index 100% rename from Tensile/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml diff --git a/Tensile/Perf/TRANSFORMER/sgemm_xdlops.yaml b/src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops.yaml similarity index 100% rename from Tensile/Perf/TRANSFORMER/sgemm_xdlops.yaml rename to 
src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops.yaml diff --git a/Tensile/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml b/src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml similarity index 100% rename from Tensile/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml rename to src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml diff --git a/Tensile/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml b/src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml similarity index 100% rename from Tensile/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml rename to src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml diff --git a/Tensile/Perf/conv/README b/src/Tensile/data/Perf/conv/README similarity index 100% rename from Tensile/Perf/conv/README rename to src/Tensile/data/Perf/conv/README diff --git a/Tensile/Perf/conv/conv_1x1_af0em.yaml b/src/Tensile/data/Perf/conv/conv_1x1_af0em.yaml similarity index 100% rename from Tensile/Perf/conv/conv_1x1_af0em.yaml rename to src/Tensile/data/Perf/conv/conv_1x1_af0em.yaml diff --git a/Tensile/Perf/conv/conv_1x1_oddpbd.yaml b/src/Tensile/data/Perf/conv/conv_1x1_oddpbd.yaml similarity index 100% rename from Tensile/Perf/conv/conv_1x1_oddpbd.yaml rename to src/Tensile/data/Perf/conv/conv_1x1_oddpbd.yaml diff --git a/Tensile/Perf/conv/conv_1x1u2_bdww.yaml b/src/Tensile/data/Perf/conv/conv_1x1u2_bdww.yaml similarity index 100% rename from Tensile/Perf/conv/conv_1x1u2_bdww.yaml rename to src/Tensile/data/Perf/conv/conv_1x1u2_bdww.yaml diff --git a/Tensile/Perf/conv/conv_1x1u2_fwd.yaml b/src/Tensile/data/Perf/conv/conv_1x1u2_fwd.yaml similarity index 100% rename from Tensile/Perf/conv/conv_1x1u2_fwd.yaml rename to src/Tensile/data/Perf/conv/conv_1x1u2_fwd.yaml diff --git a/Tensile/Perf/conv/conv_1x7_fwd.yaml b/src/Tensile/data/Perf/conv/conv_1x7_fwd.yaml similarity index 100% rename from Tensile/Perf/conv/conv_1x7_fwd.yaml rename to src/Tensile/data/Perf/conv/conv_1x7_fwd.yaml diff --git a/Tensile/Perf/conv/conv_7x1_fwd.yaml b/src/Tensile/data/Perf/conv/conv_7x1_fwd.yaml 
similarity index 100% rename from Tensile/Perf/conv/conv_7x1_fwd.yaml rename to src/Tensile/data/Perf/conv/conv_7x1_fwd.yaml diff --git a/Tensile/Perf/conv/conv_7x1_fwd2.yaml b/src/Tensile/data/Perf/conv/conv_7x1_fwd2.yaml similarity index 100% rename from Tensile/Perf/conv/conv_7x1_fwd2.yaml rename to src/Tensile/data/Perf/conv/conv_7x1_fwd2.yaml diff --git a/Tensile/Perf/conv/conv_7x1_roundup.yaml b/src/Tensile/data/Perf/conv/conv_7x1_roundup.yaml similarity index 100% rename from Tensile/Perf/conv/conv_7x1_roundup.yaml rename to src/Tensile/data/Perf/conv/conv_7x1_roundup.yaml diff --git a/Tensile/Perf/conv/conv_7x7u2_fwd.yaml b/src/Tensile/data/Perf/conv/conv_7x7u2_fwd.yaml similarity index 100% rename from Tensile/Perf/conv/conv_7x7u2_fwd.yaml rename to src/Tensile/data/Perf/conv/conv_7x7u2_fwd.yaml diff --git a/Tensile/Perf/conv/conv_bwdd_pbd.yaml b/src/Tensile/data/Perf/conv/conv_bwdd_pbd.yaml similarity index 100% rename from Tensile/Perf/conv/conv_bwdd_pbd.yaml rename to src/Tensile/data/Perf/conv/conv_bwdd_pbd.yaml diff --git a/Tensile/Perf/conv/conv_fwd.yaml b/src/Tensile/data/Perf/conv/conv_fwd.yaml similarity index 100% rename from Tensile/Perf/conv/conv_fwd.yaml rename to src/Tensile/data/Perf/conv/conv_fwd.yaml diff --git a/Tensile/Perf/conv_bwdd_ex0.yaml b/src/Tensile/data/Perf/conv_bwdd_ex0.yaml similarity index 100% rename from Tensile/Perf/conv_bwdd_ex0.yaml rename to src/Tensile/data/Perf/conv_bwdd_ex0.yaml diff --git a/Tensile/Perf/conv_bwdd_ex1.yaml b/src/Tensile/data/Perf/conv_bwdd_ex1.yaml similarity index 100% rename from Tensile/Perf/conv_bwdd_ex1.yaml rename to src/Tensile/data/Perf/conv_bwdd_ex1.yaml diff --git a/Tensile/Perf/conv_bwdw_big_gsu.yaml b/src/Tensile/data/Perf/conv_bwdw_big_gsu.yaml similarity index 100% rename from Tensile/Perf/conv_bwdw_big_gsu.yaml rename to src/Tensile/data/Perf/conv_bwdw_big_gsu.yaml diff --git a/Tensile/Perf/conv_bwdw_small_gsu.yaml b/src/Tensile/data/Perf/conv_bwdw_small_gsu.yaml similarity index 100% 
rename from Tensile/Perf/conv_bwdw_small_gsu.yaml rename to src/Tensile/data/Perf/conv_bwdw_small_gsu.yaml diff --git a/Tensile/Perf/conv_fwd_ex0.yaml b/src/Tensile/data/Perf/conv_fwd_ex0.yaml similarity index 100% rename from Tensile/Perf/conv_fwd_ex0.yaml rename to src/Tensile/data/Perf/conv_fwd_ex0.yaml diff --git a/Tensile/Perf/dgemm_large_square.yaml b/src/Tensile/data/Perf/dgemm_large_square.yaml similarity index 100% rename from Tensile/Perf/dgemm_large_square.yaml rename to src/Tensile/data/Perf/dgemm_large_square.yaml diff --git a/Tensile/Perf/hpl.yaml b/src/Tensile/data/Perf/hpl.yaml similarity index 100% rename from Tensile/Perf/hpl.yaml rename to src/Tensile/data/Perf/hpl.yaml diff --git a/Tensile/Perf/hpl_one.yaml b/src/Tensile/data/Perf/hpl_one.yaml similarity index 100% rename from Tensile/Perf/hpl_one.yaml rename to src/Tensile/data/Perf/hpl_one.yaml diff --git a/Tensile/Perf/hpl_quick.yaml b/src/Tensile/data/Perf/hpl_quick.yaml similarity index 100% rename from Tensile/Perf/hpl_quick.yaml rename to src/Tensile/data/Perf/hpl_quick.yaml diff --git a/Tensile/Perf/hpl_quick44k.yaml b/src/Tensile/data/Perf/hpl_quick44k.yaml similarity index 100% rename from Tensile/Perf/hpl_quick44k.yaml rename to src/Tensile/data/Perf/hpl_quick44k.yaml diff --git a/Tensile/Perf/inception/conv_1x1u1.yaml b/src/Tensile/data/Perf/inception/conv_1x1u1.yaml similarity index 100% rename from Tensile/Perf/inception/conv_1x1u1.yaml rename to src/Tensile/data/Perf/inception/conv_1x1u1.yaml diff --git a/Tensile/Perf/inception/conv_1x1u1_starter.yaml b/src/Tensile/data/Perf/inception/conv_1x1u1_starter.yaml similarity index 100% rename from Tensile/Perf/inception/conv_1x1u1_starter.yaml rename to src/Tensile/data/Perf/inception/conv_1x1u1_starter.yaml diff --git a/Tensile/Perf/inception/conv_NxN.yaml b/src/Tensile/data/Perf/inception/conv_NxN.yaml similarity index 100% rename from Tensile/Perf/inception/conv_NxN.yaml rename to src/Tensile/data/Perf/inception/conv_NxN.yaml diff 
--git a/Tensile/Perf/sgemm_large_square_nn.yaml b/src/Tensile/data/Perf/sgemm_large_square_nn.yaml similarity index 100% rename from Tensile/Perf/sgemm_large_square_nn.yaml rename to src/Tensile/data/Perf/sgemm_large_square_nn.yaml diff --git a/Tensile/Perf/sgemm_large_square_nt.yaml b/src/Tensile/data/Perf/sgemm_large_square_nt.yaml similarity index 100% rename from Tensile/Perf/sgemm_large_square_nt.yaml rename to src/Tensile/data/Perf/sgemm_large_square_nt.yaml diff --git a/Tensile/Perf/sgemm_large_square_tn.yaml b/src/Tensile/data/Perf/sgemm_large_square_tn.yaml similarity index 100% rename from Tensile/Perf/sgemm_large_square_tn.yaml rename to src/Tensile/data/Perf/sgemm_large_square_tn.yaml diff --git a/Tensile/Perf/use_initial_strides_cd/README b/src/Tensile/data/Perf/use_initial_strides_cd/README similarity index 100% rename from Tensile/Perf/use_initial_strides_cd/README rename to src/Tensile/data/Perf/use_initial_strides_cd/README diff --git a/Tensile/Perf/use_initial_strides_cd/perf_baseline0.yaml b/src/Tensile/data/Perf/use_initial_strides_cd/perf_baseline0.yaml similarity index 100% rename from Tensile/Perf/use_initial_strides_cd/perf_baseline0.yaml rename to src/Tensile/data/Perf/use_initial_strides_cd/perf_baseline0.yaml diff --git a/Tensile/Perf/use_initial_strides_cd/perf_uis_cd0.yaml b/src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd0.yaml similarity index 100% rename from Tensile/Perf/use_initial_strides_cd/perf_uis_cd0.yaml rename to src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd0.yaml diff --git a/Tensile/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml b/src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml similarity index 100% rename from Tensile/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml rename to src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml diff --git a/Tensile/Source/CMakeLists.txt b/src/Tensile/data/Source/CMakeLists.txt similarity index 100% rename 
from Tensile/Source/CMakeLists.txt rename to src/Tensile/data/Source/CMakeLists.txt diff --git a/Tensile/Source/EnableWarnings.cmake b/src/Tensile/data/Source/EnableWarnings.cmake similarity index 100% rename from Tensile/Source/EnableWarnings.cmake rename to src/Tensile/data/Source/EnableWarnings.cmake diff --git a/Tensile/Source/FindHIP.cmake b/src/Tensile/data/Source/FindHIP.cmake similarity index 100% rename from Tensile/Source/FindHIP.cmake rename to src/Tensile/data/Source/FindHIP.cmake diff --git a/Tensile/Source/FindOpenCL.cmake b/src/Tensile/data/Source/FindOpenCL.cmake similarity index 100% rename from Tensile/Source/FindOpenCL.cmake rename to src/Tensile/data/Source/FindOpenCL.cmake diff --git a/Tensile/Source/KernelHeader.h b/src/Tensile/data/Source/KernelHeader.h similarity index 100% rename from Tensile/Source/KernelHeader.h rename to src/Tensile/data/Source/KernelHeader.h diff --git a/Tensile/Source/TensileTypes.h b/src/Tensile/data/Source/TensileTypes.h similarity index 100% rename from Tensile/Source/TensileTypes.h rename to src/Tensile/data/Source/TensileTypes.h diff --git a/Tensile/Source/client/CMakeLists.txt b/src/Tensile/data/Source/client/CMakeLists.txt similarity index 100% rename from Tensile/Source/client/CMakeLists.txt rename to src/Tensile/data/Source/client/CMakeLists.txt diff --git a/Tensile/Source/client/include/BenchmarkTimer.hpp b/src/Tensile/data/Source/client/include/BenchmarkTimer.hpp similarity index 100% rename from Tensile/Source/client/include/BenchmarkTimer.hpp rename to src/Tensile/data/Source/client/include/BenchmarkTimer.hpp diff --git a/Tensile/Source/client/include/CSVStackFile.hpp b/src/Tensile/data/Source/client/include/CSVStackFile.hpp similarity index 100% rename from Tensile/Source/client/include/CSVStackFile.hpp rename to src/Tensile/data/Source/client/include/CSVStackFile.hpp diff --git a/Tensile/Source/client/include/ClientProblemFactory.hpp b/src/Tensile/data/Source/client/include/ClientProblemFactory.hpp 
similarity index 100% rename from Tensile/Source/client/include/ClientProblemFactory.hpp rename to src/Tensile/data/Source/client/include/ClientProblemFactory.hpp diff --git a/Tensile/Source/client/include/ConvolutionProblem.hpp b/src/Tensile/data/Source/client/include/ConvolutionProblem.hpp similarity index 100% rename from Tensile/Source/client/include/ConvolutionProblem.hpp rename to src/Tensile/data/Source/client/include/ConvolutionProblem.hpp diff --git a/Tensile/Source/client/include/DataInitialization.hpp b/src/Tensile/data/Source/client/include/DataInitialization.hpp similarity index 100% rename from Tensile/Source/client/include/DataInitialization.hpp rename to src/Tensile/data/Source/client/include/DataInitialization.hpp diff --git a/Tensile/Source/client/include/DataInitializationTyped.hpp b/src/Tensile/data/Source/client/include/DataInitializationTyped.hpp similarity index 100% rename from Tensile/Source/client/include/DataInitializationTyped.hpp rename to src/Tensile/data/Source/client/include/DataInitializationTyped.hpp diff --git a/Tensile/Source/client/include/HardwareMonitor.hpp b/src/Tensile/data/Source/client/include/HardwareMonitor.hpp similarity index 100% rename from Tensile/Source/client/include/HardwareMonitor.hpp rename to src/Tensile/data/Source/client/include/HardwareMonitor.hpp diff --git a/Tensile/Source/client/include/HardwareMonitorListener.hpp b/src/Tensile/data/Source/client/include/HardwareMonitorListener.hpp similarity index 100% rename from Tensile/Source/client/include/HardwareMonitorListener.hpp rename to src/Tensile/data/Source/client/include/HardwareMonitorListener.hpp diff --git a/Tensile/Source/client/include/HardwareMonitorType.hpp b/src/Tensile/data/Source/client/include/HardwareMonitorType.hpp similarity index 100% rename from Tensile/Source/client/include/HardwareMonitorType.hpp rename to src/Tensile/data/Source/client/include/HardwareMonitorType.hpp diff --git a/Tensile/Source/client/include/HardwareMonitorWindows.hpp 
b/src/Tensile/data/Source/client/include/HardwareMonitorWindows.hpp similarity index 100% rename from Tensile/Source/client/include/HardwareMonitorWindows.hpp rename to src/Tensile/data/Source/client/include/HardwareMonitorWindows.hpp diff --git a/Tensile/Source/client/include/HardwareMonitor_fwd.hpp b/src/Tensile/data/Source/client/include/HardwareMonitor_fwd.hpp similarity index 100% rename from Tensile/Source/client/include/HardwareMonitor_fwd.hpp rename to src/Tensile/data/Source/client/include/HardwareMonitor_fwd.hpp diff --git a/Tensile/Source/client/include/LibraryUpdateReporter.hpp b/src/Tensile/data/Source/client/include/LibraryUpdateReporter.hpp similarity index 100% rename from Tensile/Source/client/include/LibraryUpdateReporter.hpp rename to src/Tensile/data/Source/client/include/LibraryUpdateReporter.hpp diff --git a/Tensile/Source/client/include/LogReporter.hpp b/src/Tensile/data/Source/client/include/LogReporter.hpp similarity index 100% rename from Tensile/Source/client/include/LogReporter.hpp rename to src/Tensile/data/Source/client/include/LogReporter.hpp diff --git a/Tensile/Source/client/include/MetaResultReporter.hpp b/src/Tensile/data/Source/client/include/MetaResultReporter.hpp similarity index 100% rename from Tensile/Source/client/include/MetaResultReporter.hpp rename to src/Tensile/data/Source/client/include/MetaResultReporter.hpp diff --git a/Tensile/Source/client/include/MetaRunListener.hpp b/src/Tensile/data/Source/client/include/MetaRunListener.hpp similarity index 100% rename from Tensile/Source/client/include/MetaRunListener.hpp rename to src/Tensile/data/Source/client/include/MetaRunListener.hpp diff --git a/Tensile/Source/client/include/PerformanceReporter.hpp b/src/Tensile/data/Source/client/include/PerformanceReporter.hpp similarity index 100% rename from Tensile/Source/client/include/PerformanceReporter.hpp rename to src/Tensile/data/Source/client/include/PerformanceReporter.hpp diff --git 
a/Tensile/Source/client/include/ProgressListener.hpp b/src/Tensile/data/Source/client/include/ProgressListener.hpp similarity index 100% rename from Tensile/Source/client/include/ProgressListener.hpp rename to src/Tensile/data/Source/client/include/ProgressListener.hpp diff --git a/Tensile/Source/client/include/Reference.hpp b/src/Tensile/data/Source/client/include/Reference.hpp similarity index 100% rename from Tensile/Source/client/include/Reference.hpp rename to src/Tensile/data/Source/client/include/Reference.hpp diff --git a/Tensile/Source/client/include/ReferenceValidator.hpp b/src/Tensile/data/Source/client/include/ReferenceValidator.hpp similarity index 100% rename from Tensile/Source/client/include/ReferenceValidator.hpp rename to src/Tensile/data/Source/client/include/ReferenceValidator.hpp diff --git a/Tensile/Source/client/include/ResultComparison.hpp b/src/Tensile/data/Source/client/include/ResultComparison.hpp similarity index 100% rename from Tensile/Source/client/include/ResultComparison.hpp rename to src/Tensile/data/Source/client/include/ResultComparison.hpp diff --git a/Tensile/Source/client/include/ResultFileReporter.hpp b/src/Tensile/data/Source/client/include/ResultFileReporter.hpp similarity index 100% rename from Tensile/Source/client/include/ResultFileReporter.hpp rename to src/Tensile/data/Source/client/include/ResultFileReporter.hpp diff --git a/Tensile/Source/client/include/ResultReporter.hpp b/src/Tensile/data/Source/client/include/ResultReporter.hpp similarity index 100% rename from Tensile/Source/client/include/ResultReporter.hpp rename to src/Tensile/data/Source/client/include/ResultReporter.hpp diff --git a/Tensile/Source/client/include/ResultReporter_fwd.hpp b/src/Tensile/data/Source/client/include/ResultReporter_fwd.hpp similarity index 100% rename from Tensile/Source/client/include/ResultReporter_fwd.hpp rename to src/Tensile/data/Source/client/include/ResultReporter_fwd.hpp diff --git 
a/Tensile/Source/client/include/RunListener.hpp b/src/Tensile/data/Source/client/include/RunListener.hpp similarity index 100% rename from Tensile/Source/client/include/RunListener.hpp rename to src/Tensile/data/Source/client/include/RunListener.hpp diff --git a/Tensile/Source/client/include/SolutionIterator.hpp b/src/Tensile/data/Source/client/include/SolutionIterator.hpp similarity index 100% rename from Tensile/Source/client/include/SolutionIterator.hpp rename to src/Tensile/data/Source/client/include/SolutionIterator.hpp diff --git a/Tensile/Source/client/include/TimingEvents.hpp b/src/Tensile/data/Source/client/include/TimingEvents.hpp similarity index 100% rename from Tensile/Source/client/include/TimingEvents.hpp rename to src/Tensile/data/Source/client/include/TimingEvents.hpp diff --git a/Tensile/Source/client/main.cpp b/src/Tensile/data/Source/client/main.cpp similarity index 100% rename from Tensile/Source/client/main.cpp rename to src/Tensile/data/Source/client/main.cpp diff --git a/Tensile/Source/client/source/BenchmarkTimer.cpp b/src/Tensile/data/Source/client/source/BenchmarkTimer.cpp similarity index 100% rename from Tensile/Source/client/source/BenchmarkTimer.cpp rename to src/Tensile/data/Source/client/source/BenchmarkTimer.cpp diff --git a/Tensile/Source/client/source/CSVStackFile.cpp b/src/Tensile/data/Source/client/source/CSVStackFile.cpp similarity index 100% rename from Tensile/Source/client/source/CSVStackFile.cpp rename to src/Tensile/data/Source/client/source/CSVStackFile.cpp diff --git a/Tensile/Source/client/source/ClientProblemFactory.cpp b/src/Tensile/data/Source/client/source/ClientProblemFactory.cpp similarity index 100% rename from Tensile/Source/client/source/ClientProblemFactory.cpp rename to src/Tensile/data/Source/client/source/ClientProblemFactory.cpp diff --git a/Tensile/Source/client/source/ConvolutionProblem.cpp b/src/Tensile/data/Source/client/source/ConvolutionProblem.cpp similarity index 100% rename from 
Tensile/Source/client/source/ConvolutionProblem.cpp rename to src/Tensile/data/Source/client/source/ConvolutionProblem.cpp diff --git a/Tensile/Source/client/source/DataInitialization.cpp b/src/Tensile/data/Source/client/source/DataInitialization.cpp similarity index 100% rename from Tensile/Source/client/source/DataInitialization.cpp rename to src/Tensile/data/Source/client/source/DataInitialization.cpp diff --git a/Tensile/Source/client/source/HardwareMonitor.cpp b/src/Tensile/data/Source/client/source/HardwareMonitor.cpp similarity index 100% rename from Tensile/Source/client/source/HardwareMonitor.cpp rename to src/Tensile/data/Source/client/source/HardwareMonitor.cpp diff --git a/Tensile/Source/client/source/HardwareMonitorListener.cpp b/src/Tensile/data/Source/client/source/HardwareMonitorListener.cpp similarity index 100% rename from Tensile/Source/client/source/HardwareMonitorListener.cpp rename to src/Tensile/data/Source/client/source/HardwareMonitorListener.cpp diff --git a/Tensile/Source/client/source/LibraryUpdateReporter.cpp b/src/Tensile/data/Source/client/source/LibraryUpdateReporter.cpp similarity index 100% rename from Tensile/Source/client/source/LibraryUpdateReporter.cpp rename to src/Tensile/data/Source/client/source/LibraryUpdateReporter.cpp diff --git a/Tensile/Source/client/source/MetaRunListener.cpp b/src/Tensile/data/Source/client/source/MetaRunListener.cpp similarity index 100% rename from Tensile/Source/client/source/MetaRunListener.cpp rename to src/Tensile/data/Source/client/source/MetaRunListener.cpp diff --git a/Tensile/Source/client/source/PerformanceReporter.cpp b/src/Tensile/data/Source/client/source/PerformanceReporter.cpp similarity index 100% rename from Tensile/Source/client/source/PerformanceReporter.cpp rename to src/Tensile/data/Source/client/source/PerformanceReporter.cpp diff --git a/Tensile/Source/client/source/ProgressListener.cpp b/src/Tensile/data/Source/client/source/ProgressListener.cpp similarity index 100% rename 
from Tensile/Source/client/source/ProgressListener.cpp rename to src/Tensile/data/Source/client/source/ProgressListener.cpp diff --git a/Tensile/Source/client/source/Reference.cpp b/src/Tensile/data/Source/client/source/Reference.cpp similarity index 100% rename from Tensile/Source/client/source/Reference.cpp rename to src/Tensile/data/Source/client/source/Reference.cpp diff --git a/Tensile/Source/client/source/ReferenceValidator.cpp b/src/Tensile/data/Source/client/source/ReferenceValidator.cpp similarity index 100% rename from Tensile/Source/client/source/ReferenceValidator.cpp rename to src/Tensile/data/Source/client/source/ReferenceValidator.cpp diff --git a/Tensile/Source/client/source/ResultFileReporter.cpp b/src/Tensile/data/Source/client/source/ResultFileReporter.cpp similarity index 100% rename from Tensile/Source/client/source/ResultFileReporter.cpp rename to src/Tensile/data/Source/client/source/ResultFileReporter.cpp diff --git a/Tensile/Source/client/source/ResultReporter.cpp b/src/Tensile/data/Source/client/source/ResultReporter.cpp similarity index 100% rename from Tensile/Source/client/source/ResultReporter.cpp rename to src/Tensile/data/Source/client/source/ResultReporter.cpp diff --git a/Tensile/Source/client/source/SolutionIterator.cpp b/src/Tensile/data/Source/client/source/SolutionIterator.cpp similarity index 100% rename from Tensile/Source/client/source/SolutionIterator.cpp rename to src/Tensile/data/Source/client/source/SolutionIterator.cpp diff --git a/Tensile/Source/client/source/TimingEvents.cpp b/src/Tensile/data/Source/client/source/TimingEvents.cpp similarity index 100% rename from Tensile/Source/client/source/TimingEvents.cpp rename to src/Tensile/data/Source/client/source/TimingEvents.cpp diff --git a/Tensile/Source/cmake/FindROCmSMI.cmake b/src/Tensile/data/Source/cmake/FindROCmSMI.cmake similarity index 100% rename from Tensile/Source/cmake/FindROCmSMI.cmake rename to src/Tensile/data/Source/cmake/FindROCmSMI.cmake diff --git 
a/Tensile/Source/hip_f8_impl.h b/src/Tensile/data/Source/hip_f8_impl.h similarity index 100% rename from Tensile/Source/hip_f8_impl.h rename to src/Tensile/data/Source/hip_f8_impl.h diff --git a/Tensile/Source/lib/CMakeLists.txt b/src/Tensile/data/Source/lib/CMakeLists.txt similarity index 100% rename from Tensile/Source/lib/CMakeLists.txt rename to src/Tensile/data/Source/lib/CMakeLists.txt diff --git a/Tensile/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml b/src/Tensile/data/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml similarity index 100% rename from Tensile/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml rename to src/Tensile/data/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml similarity index 100% rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml similarity index 100% rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml similarity index 100% rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml similarity index 100% rename 
from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/src/Tensile/data/Source/lib/include/Tensile/AMDGPU.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/AMDGPU.hpp rename to src/Tensile/data/Source/lib/include/Tensile/AMDGPU.hpp diff --git a/Tensile/Source/lib/include/Tensile/AMDGPUPredicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/AMDGPUPredicates.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/AMDGPUPredicates.hpp rename to src/Tensile/data/Source/lib/include/Tensile/AMDGPUPredicates.hpp diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU_Detail.hpp b/src/Tensile/data/Source/lib/include/Tensile/AMDGPU_Detail.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/AMDGPU_Detail.hpp rename to src/Tensile/data/Source/lib/include/Tensile/AMDGPU_Detail.hpp diff --git a/Tensile/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp diff --git a/Tensile/Source/lib/include/Tensile/CachingLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/CachingLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/CachingLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/CachingLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Comparison.hpp b/src/Tensile/data/Source/lib/include/Tensile/Comparison.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Comparison.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Comparison.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionLibrary.hpp 
b/src/Tensile/data/Source/lib/include/Tensile/ContractionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblem.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblem.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionProblem.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblem.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblemPredicates.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblemPredicates.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblemProperties.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblemProperties.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionProblemProperties.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblemProperties.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblem_Detail.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_Detail.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionProblem_Detail.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_Detail.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblem_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_fwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionProblem_fwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_fwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionSolution.hpp 
b/src/Tensile/data/Source/lib/include/Tensile/ContractionSolution.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionSolution.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionSolution.hpp diff --git a/Tensile/Source/lib/include/Tensile/ContractionSolution_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionSolution_fwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ContractionSolution_fwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ContractionSolution_fwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/Contractions.hpp b/src/Tensile/data/Source/lib/include/Tensile/Contractions.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Contractions.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Contractions.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_BFloat16.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_BFloat16.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes_BFloat16.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_BFloat16.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Half.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Half.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes_Half.hpp rename to 
src/Tensile/data/Source/lib/include/Tensile/DataTypes_Half.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Int8.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes_Int8.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Int8x4.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8x4.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes_Int8x4.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8x4.hpp diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_XFloat32.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_XFloat32.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DataTypes_XFloat32.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_XFloat32.hpp diff --git a/Tensile/Source/lib/include/Tensile/Debug.hpp b/src/Tensile/data/Source/lib/include/Tensile/Debug.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Debug.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Debug.hpp diff --git a/Tensile/Source/lib/include/Tensile/DecisionTree.hpp b/src/Tensile/data/Source/lib/include/Tensile/DecisionTree.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DecisionTree.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DecisionTree.hpp diff --git a/Tensile/Source/lib/include/Tensile/DecisionTreeLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/DecisionTreeLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DecisionTreeLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DecisionTreeLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Distance.hpp b/src/Tensile/data/Source/lib/include/Tensile/Distance.hpp similarity index 100% rename from 
Tensile/Source/lib/include/Tensile/Distance.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Distance.hpp diff --git a/Tensile/Source/lib/include/Tensile/DistinctType.hpp b/src/Tensile/data/Source/lib/include/Tensile/DistinctType.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/DistinctType.hpp rename to src/Tensile/data/Source/lib/include/Tensile/DistinctType.hpp diff --git a/Tensile/Source/lib/include/Tensile/EmbeddedData.hpp b/src/Tensile/data/Source/lib/include/Tensile/EmbeddedData.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/EmbeddedData.hpp rename to src/Tensile/data/Source/lib/include/Tensile/EmbeddedData.hpp diff --git a/Tensile/Source/lib/include/Tensile/EmbeddedLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/EmbeddedLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/EmbeddedLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/EmbeddedLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/ExactLogicLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/ExactLogicLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ExactLogicLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ExactLogicLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/KernelArguments.hpp b/src/Tensile/data/Source/lib/include/Tensile/KernelArguments.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/KernelArguments.hpp rename to src/Tensile/data/Source/lib/include/Tensile/KernelArguments.hpp diff --git 
a/Tensile/Source/lib/include/Tensile/KernelLanguageTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/KernelLanguageTypes.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/KernelLanguageTypes.hpp rename to src/Tensile/data/Source/lib/include/Tensile/KernelLanguageTypes.hpp diff --git a/Tensile/Source/lib/include/Tensile/MLFeatures.hpp b/src/Tensile/data/Source/lib/include/Tensile/MLFeatures.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/MLFeatures.hpp rename to src/Tensile/data/Source/lib/include/Tensile/MLFeatures.hpp diff --git a/Tensile/Source/lib/include/Tensile/Macros.hpp b/src/Tensile/data/Source/lib/include/Tensile/Macros.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Macros.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Macros.hpp diff --git a/Tensile/Source/lib/include/Tensile/MapLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/MapLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/MapLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/MapLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/MasterSolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/MasterSolutionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/MasterSolutionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/MasterSolutionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/MatchingLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/MatchingLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/MatchingLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/MatchingLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/PerformanceMetricTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/PerformanceMetricTypes.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/PerformanceMetricTypes.hpp rename to 
src/Tensile/data/Source/lib/include/Tensile/PerformanceMetricTypes.hpp diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/PlaceholderLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/PlaceholderLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Predicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/Predicates.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Predicates.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Predicates.hpp diff --git a/Tensile/Source/lib/include/Tensile/ProblemKey.hpp b/src/Tensile/data/Source/lib/include/Tensile/ProblemKey.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ProblemKey.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ProblemKey.hpp diff --git a/Tensile/Source/lib/include/Tensile/Properties.hpp b/src/Tensile/data/Source/lib/include/Tensile/Properties.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Properties.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Properties.hpp diff --git a/Tensile/Source/lib/include/Tensile/PropertyMatching.hpp b/src/Tensile/data/Source/lib/include/Tensile/PropertyMatching.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/PropertyMatching.hpp rename to src/Tensile/data/Source/lib/include/Tensile/PropertyMatching.hpp diff --git a/Tensile/Source/lib/include/Tensile/ScalarValueTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/ScalarValueTypes.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ScalarValueTypes.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ScalarValueTypes.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization.hpp similarity index 100% rename from 
Tensile/Source/lib/include/Tensile/Serialization.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Base.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Base.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/Base.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Base.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Containers.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Containers.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/Containers.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Containers.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp 
b/src/Tensile/data/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/HasTraits.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/HasTraits.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/HasTraits.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/HasTraits.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/MLFeatures.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/MLFeatures.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/MLFeatures.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/MLFeatures.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/MapLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/MapLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/MapLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/MapLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp rename to 
src/Tensile/data/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Predicates.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Predicates.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Properties.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Properties.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/Properties.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Properties.hpp diff --git a/Tensile/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/SingleSolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/SingleSolutionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/SingleSolutionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/SingleSolutionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Singleton.hpp b/src/Tensile/data/Source/lib/include/Tensile/Singleton.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Singleton.hpp rename to 
src/Tensile/data/Source/lib/include/Tensile/Singleton.hpp diff --git a/Tensile/Source/lib/include/Tensile/SolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/SolutionLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/SolutionMapLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/SolutionMapLibrary.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/SolutionMapLibrary.hpp rename to src/Tensile/data/Source/lib/include/Tensile/SolutionMapLibrary.hpp diff --git a/Tensile/Source/lib/include/Tensile/Tensile.hpp b/src/Tensile/data/Source/lib/include/Tensile/Tensile.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Tensile.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Tensile.hpp diff --git a/Tensile/Source/lib/include/Tensile/Tensile_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/Tensile_fwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Tensile_fwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Tensile_fwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/TensorDescriptor.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/TensorDescriptor.hpp rename to src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor.hpp diff --git a/Tensile/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp similarity index 100% 
rename from Tensile/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp rename to src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp diff --git a/Tensile/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/TensorOps.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorOps.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/TensorOps.hpp rename to src/Tensile/data/Source/lib/include/Tensile/TensorOps.hpp diff --git a/Tensile/Source/lib/include/Tensile/TensorOps_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorOps_fwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/TensorOps_fwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/TensorOps_fwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/UserDrivenTuningParser.hpp b/src/Tensile/data/Source/lib/include/Tensile/UserDrivenTuningParser.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/UserDrivenTuningParser.hpp rename to src/Tensile/data/Source/lib/include/Tensile/UserDrivenTuningParser.hpp diff --git a/Tensile/Source/lib/include/Tensile/Utils.hpp b/src/Tensile/data/Source/lib/include/Tensile/Utils.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/Utils.hpp rename to src/Tensile/data/Source/lib/include/Tensile/Utils.hpp diff --git a/Tensile/Source/lib/include/Tensile/geom.hpp b/src/Tensile/data/Source/lib/include/Tensile/geom.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/geom.hpp rename to src/Tensile/data/Source/lib/include/Tensile/geom.hpp diff --git a/Tensile/Source/lib/include/Tensile/hip/HipHardware.hpp 
b/src/Tensile/data/Source/lib/include/Tensile/hip/HipHardware.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/hip/HipHardware.hpp rename to src/Tensile/data/Source/lib/include/Tensile/hip/HipHardware.hpp diff --git a/Tensile/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp b/src/Tensile/data/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp rename to src/Tensile/data/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp diff --git a/Tensile/Source/lib/include/Tensile/hip/HipUtils.hpp b/src/Tensile/data/Source/lib/include/Tensile/hip/HipUtils.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/hip/HipUtils.hpp rename to src/Tensile/data/Source/lib/include/Tensile/hip/HipUtils.hpp diff --git a/Tensile/Source/lib/include/Tensile/hip_f8_impl.h b/src/Tensile/data/Source/lib/include/Tensile/hip_f8_impl.h similarity index 100% rename from Tensile/Source/lib/include/Tensile/hip_f8_impl.h rename to src/Tensile/data/Source/lib/include/Tensile/hip_f8_impl.h diff --git a/Tensile/Source/lib/include/Tensile/llvm/Loading.hpp b/src/Tensile/data/Source/lib/include/Tensile/llvm/Loading.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/llvm/Loading.hpp rename to src/Tensile/data/Source/lib/include/Tensile/llvm/Loading.hpp diff --git a/Tensile/Source/lib/include/Tensile/llvm/YAML.hpp b/src/Tensile/data/Source/lib/include/Tensile/llvm/YAML.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/llvm/YAML.hpp rename to src/Tensile/data/Source/lib/include/Tensile/llvm/YAML.hpp diff --git a/Tensile/Source/lib/include/Tensile/msgpack/Loading.hpp b/src/Tensile/data/Source/lib/include/Tensile/msgpack/Loading.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/msgpack/Loading.hpp rename to src/Tensile/data/Source/lib/include/Tensile/msgpack/Loading.hpp diff --git 
a/Tensile/Source/lib/include/Tensile/msgpack/MessagePack.hpp b/src/Tensile/data/Source/lib/include/Tensile/msgpack/MessagePack.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/msgpack/MessagePack.hpp rename to src/Tensile/data/Source/lib/include/Tensile/msgpack/MessagePack.hpp diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclFwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclFwd.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ocl/OclFwd.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclFwd.hpp diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclHardware.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclHardware.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ocl/OclHardware.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclHardware.hpp diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclUtils.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclUtils.hpp similarity index 100% rename from Tensile/Source/lib/include/Tensile/ocl/OclUtils.hpp rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclUtils.hpp diff --git a/Tensile/Source/lib/source/AMDGPU.cpp b/src/Tensile/data/Source/lib/source/AMDGPU.cpp similarity index 100% rename from Tensile/Source/lib/source/AMDGPU.cpp rename to src/Tensile/data/Source/lib/source/AMDGPU.cpp diff --git a/Tensile/Source/lib/source/ArithmeticUnitTypes.cpp b/src/Tensile/data/Source/lib/source/ArithmeticUnitTypes.cpp similarity index 100% rename from Tensile/Source/lib/source/ArithmeticUnitTypes.cpp rename to src/Tensile/data/Source/lib/source/ArithmeticUnitTypes.cpp diff --git 
a/Tensile/Source/lib/source/ContractionProblem.cpp b/src/Tensile/data/Source/lib/source/ContractionProblem.cpp similarity index 100% rename from Tensile/Source/lib/source/ContractionProblem.cpp rename to src/Tensile/data/Source/lib/source/ContractionProblem.cpp diff --git a/Tensile/Source/lib/source/ContractionSolution.cpp b/src/Tensile/data/Source/lib/source/ContractionSolution.cpp similarity index 100% rename from Tensile/Source/lib/source/ContractionSolution.cpp rename to src/Tensile/data/Source/lib/source/ContractionSolution.cpp diff --git a/Tensile/Source/lib/source/DataTypes.cpp b/src/Tensile/data/Source/lib/source/DataTypes.cpp similarity index 100% rename from Tensile/Source/lib/source/DataTypes.cpp rename to src/Tensile/data/Source/lib/source/DataTypes.cpp diff --git a/Tensile/Source/lib/source/Debug.cpp b/src/Tensile/data/Source/lib/source/Debug.cpp similarity index 100% rename from Tensile/Source/lib/source/Debug.cpp rename to src/Tensile/data/Source/lib/source/Debug.cpp diff --git a/Tensile/Source/lib/source/EmbeddedData.cpp b/src/Tensile/data/Source/lib/source/EmbeddedData.cpp similarity index 100% rename from Tensile/Source/lib/source/EmbeddedData.cpp rename to src/Tensile/data/Source/lib/source/EmbeddedData.cpp diff --git a/Tensile/Source/lib/source/EmbeddedLibrary.cpp b/src/Tensile/data/Source/lib/source/EmbeddedLibrary.cpp similarity index 100% rename from Tensile/Source/lib/source/EmbeddedLibrary.cpp rename to src/Tensile/data/Source/lib/source/EmbeddedLibrary.cpp diff --git a/Tensile/Source/lib/source/KernelArguments.cpp b/src/Tensile/data/Source/lib/source/KernelArguments.cpp similarity index 100% rename from Tensile/Source/lib/source/KernelArguments.cpp rename to src/Tensile/data/Source/lib/source/KernelArguments.cpp diff --git a/Tensile/Source/lib/source/KernelLanguageTypes.cpp b/src/Tensile/data/Source/lib/source/KernelLanguageTypes.cpp similarity index 100% rename from Tensile/Source/lib/source/KernelLanguageTypes.cpp rename to 
src/Tensile/data/Source/lib/source/KernelLanguageTypes.cpp diff --git a/Tensile/Source/lib/source/MLFeatures.cpp b/src/Tensile/data/Source/lib/source/MLFeatures.cpp similarity index 100% rename from Tensile/Source/lib/source/MLFeatures.cpp rename to src/Tensile/data/Source/lib/source/MLFeatures.cpp diff --git a/Tensile/Source/lib/source/PerformanceMetricTypes.cpp b/src/Tensile/data/Source/lib/source/PerformanceMetricTypes.cpp similarity index 100% rename from Tensile/Source/lib/source/PerformanceMetricTypes.cpp rename to src/Tensile/data/Source/lib/source/PerformanceMetricTypes.cpp diff --git a/Tensile/Source/lib/source/ScalarValueTypes.cpp b/src/Tensile/data/Source/lib/source/ScalarValueTypes.cpp similarity index 100% rename from Tensile/Source/lib/source/ScalarValueTypes.cpp rename to src/Tensile/data/Source/lib/source/ScalarValueTypes.cpp diff --git a/Tensile/Source/lib/source/Tensile.cpp b/src/Tensile/data/Source/lib/source/Tensile.cpp similarity index 100% rename from Tensile/Source/lib/source/Tensile.cpp rename to src/Tensile/data/Source/lib/source/Tensile.cpp diff --git a/Tensile/Source/lib/source/TensorDescriptor.cpp b/src/Tensile/data/Source/lib/source/TensorDescriptor.cpp similarity index 100% rename from Tensile/Source/lib/source/TensorDescriptor.cpp rename to src/Tensile/data/Source/lib/source/TensorDescriptor.cpp diff --git a/Tensile/Source/lib/source/TensorOps.cpp b/src/Tensile/data/Source/lib/source/TensorOps.cpp similarity index 100% rename from Tensile/Source/lib/source/TensorOps.cpp rename to src/Tensile/data/Source/lib/source/TensorOps.cpp diff --git a/Tensile/Source/lib/source/UserDrivenTuningParser.cpp b/src/Tensile/data/Source/lib/source/UserDrivenTuningParser.cpp similarity index 100% rename from Tensile/Source/lib/source/UserDrivenTuningParser.cpp rename to src/Tensile/data/Source/lib/source/UserDrivenTuningParser.cpp diff --git a/Tensile/Source/lib/source/Utils.cpp b/src/Tensile/data/Source/lib/source/Utils.cpp similarity index 100% rename 
from Tensile/Source/lib/source/Utils.cpp rename to src/Tensile/data/Source/lib/source/Utils.cpp diff --git a/Tensile/Source/lib/source/hip/CMakeLists.txt b/src/Tensile/data/Source/lib/source/hip/CMakeLists.txt similarity index 100% rename from Tensile/Source/lib/source/hip/CMakeLists.txt rename to src/Tensile/data/Source/lib/source/hip/CMakeLists.txt diff --git a/Tensile/Source/lib/source/hip/HipHardware.cpp b/src/Tensile/data/Source/lib/source/hip/HipHardware.cpp similarity index 100% rename from Tensile/Source/lib/source/hip/HipHardware.cpp rename to src/Tensile/data/Source/lib/source/hip/HipHardware.cpp diff --git a/Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp b/src/Tensile/data/Source/lib/source/hip/HipSolutionAdapter.cpp similarity index 100% rename from Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp rename to src/Tensile/data/Source/lib/source/hip/HipSolutionAdapter.cpp diff --git a/Tensile/Source/lib/source/llvm/Loading.cpp b/src/Tensile/data/Source/lib/source/llvm/Loading.cpp similarity index 100% rename from Tensile/Source/lib/source/llvm/Loading.cpp rename to src/Tensile/data/Source/lib/source/llvm/Loading.cpp diff --git a/Tensile/Source/lib/source/llvm/YAML.cpp b/src/Tensile/data/Source/lib/source/llvm/YAML.cpp similarity index 100% rename from Tensile/Source/lib/source/llvm/YAML.cpp rename to src/Tensile/data/Source/lib/source/llvm/YAML.cpp diff --git a/Tensile/Source/lib/source/msgpack/MessagePack.cpp b/src/Tensile/data/Source/lib/source/msgpack/MessagePack.cpp similarity index 100% rename from Tensile/Source/lib/source/msgpack/MessagePack.cpp rename to src/Tensile/data/Source/lib/source/msgpack/MessagePack.cpp diff --git a/Tensile/Source/lib/source/ocl/CMakeLists.txt b/src/Tensile/data/Source/lib/source/ocl/CMakeLists.txt similarity index 100% rename from Tensile/Source/lib/source/ocl/CMakeLists.txt rename to src/Tensile/data/Source/lib/source/ocl/CMakeLists.txt diff --git a/Tensile/Source/lib/source/ocl/OclHardware.cpp 
b/src/Tensile/data/Source/lib/source/ocl/OclHardware.cpp similarity index 100% rename from Tensile/Source/lib/source/ocl/OclHardware.cpp rename to src/Tensile/data/Source/lib/source/ocl/OclHardware.cpp diff --git a/Tensile/Source/lib/source/ocl/OclSolutionAdapter.cpp b/src/Tensile/data/Source/lib/source/ocl/OclSolutionAdapter.cpp similarity index 100% rename from Tensile/Source/lib/source/ocl/OclSolutionAdapter.cpp rename to src/Tensile/data/Source/lib/source/ocl/OclSolutionAdapter.cpp diff --git a/Tensile/Source/lib/source/ocl/OclUtils.cpp b/src/Tensile/data/Source/lib/source/ocl/OclUtils.cpp similarity index 100% rename from Tensile/Source/lib/source/ocl/OclUtils.cpp rename to src/Tensile/data/Source/lib/source/ocl/OclUtils.cpp diff --git a/Tensile/Source/multigpu.sh b/src/Tensile/data/Source/multigpu.sh similarity index 100% rename from Tensile/Source/multigpu.sh rename to src/Tensile/data/Source/multigpu.sh diff --git a/Tensile/Source/tensile_bfloat16.h b/src/Tensile/data/Source/tensile_bfloat16.h similarity index 100% rename from Tensile/Source/tensile_bfloat16.h rename to src/Tensile/data/Source/tensile_bfloat16.h diff --git a/Tensile/Source/tensile_float8_bfloat8.h b/src/Tensile/data/Source/tensile_float8_bfloat8.h similarity index 100% rename from Tensile/Source/tensile_float8_bfloat8.h rename to src/Tensile/data/Source/tensile_float8_bfloat8.h diff --git a/Tensile/Source/winners.awk b/src/Tensile/data/Source/winners.awk similarity index 100% rename from Tensile/Source/winners.awk rename to src/Tensile/data/Source/winners.awk diff --git a/Tensile/Utilities/archive/merge_rocblas_yaml_files.py b/src/Tensile/data/Utilities/archive/merge_rocblas_yaml_files.py similarity index 100% rename from Tensile/Utilities/archive/merge_rocblas_yaml_files.py rename to src/Tensile/data/Utilities/archive/merge_rocblas_yaml_files.py diff --git a/Tensile/Utilities/merge.py b/src/Tensile/data/Utilities/merge.py similarity index 100% rename from Tensile/Utilities/merge.py rename 
to src/Tensile/data/Utilities/merge.py diff --git a/Tensile/cmake/TensileConfig.cmake b/src/Tensile/data/cmake/TensileConfig.cmake similarity index 100% rename from Tensile/cmake/TensileConfig.cmake rename to src/Tensile/data/cmake/TensileConfig.cmake diff --git a/Tensile/cmake/TensileConfigVersion.cmake b/src/Tensile/data/cmake/TensileConfigVersion.cmake similarity index 100% rename from Tensile/cmake/TensileConfigVersion.cmake rename to src/Tensile/data/cmake/TensileConfigVersion.cmake From d24a9531a93e20ec4dae6f2594a0a5f088e52884 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Tue, 12 Dec 2023 01:31:06 +0100 Subject: [PATCH 02/13] Let poetry to create executable --- pyproject.toml | 39 ++++++++++++++++++++ src/Tensile/GenerateSummations.py | 3 +- src/Tensile/TensileCreateLibrary.py | 2 +- src/Tensile/bin/Tensile | 39 -------------------- src/Tensile/bin/TensileBenchmarkCluster | 39 -------------------- src/Tensile/bin/TensileClientConfig | 39 -------------------- src/Tensile/bin/TensileCreateLibrary | 43 ----------------------- src/Tensile/bin/TensileGenerateSummations | 41 --------------------- src/Tensile/bin/TensileLibLogicToYaml | 39 -------------------- src/Tensile/bin/TensileMergeLibrary | 41 --------------------- src/Tensile/bin/TensileRetuneLibrary | 39 -------------------- src/Tensile/bin/TensileUpdateLibrary | 41 --------------------- 12 files changed, 42 insertions(+), 363 deletions(-) create mode 100644 pyproject.toml delete mode 100755 src/Tensile/bin/Tensile delete mode 100755 src/Tensile/bin/TensileBenchmarkCluster delete mode 100755 src/Tensile/bin/TensileClientConfig delete mode 100755 src/Tensile/bin/TensileCreateLibrary delete mode 100755 src/Tensile/bin/TensileGenerateSummations delete mode 100755 src/Tensile/bin/TensileLibLogicToYaml delete mode 100755 src/Tensile/bin/TensileMergeLibrary delete mode 100755 src/Tensile/bin/TensileRetuneLibrary delete mode 100755 src/Tensile/bin/TensileUpdateLibrary diff --git a/pyproject.toml 
b/pyproject.toml new file mode 100644 index 0000000000..9fa1466336 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,39 @@ +[build-system] +requires = ["setuptools>=65.5.1", "wheel", "poetry_core>=1.5.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "Tensile" +version = "4.40.0" +description = "Tensile is a tool for creating benchmark-driven backend libraries for GEMMs" +license = "MIT" +classifiers = [ + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", +] +packages = [ + { include = "Tensile", from = "src" }, +] + +[tool.poetry.dependencies] +python = ">=3.8" +toml = ">=0.10" +pyyaml = ">=6.0" +msgpack = ">=1.0" +joblib = ">=1.2" +pandas = ">=1.5" + +[tool.poetry.group.dev.dependencies] +pytest = "7.3.2" + + +[tool.poetry.scripts] +tensile = "Tensile.Tensile:main" +tensile_benchmark_cluster = "Tensile.TensileBenchmarkCluster:main" +tensile_client_config = "Tensile.TensileClientConfig:main" +tensile_create_library = "Tensile.TensileCreateLibrary:main" +tensile_generate_summations = "Tensile.GenerateSummations:main" +tensile_lib_logic_to_yaml = "Tensile.TensileLibLogicToYaml:main" +tensile_merge_library = "Tensile.TensileMergeLibrary:main" +tensile_retune_library = "Tensile.TensileRetuneLibrary:main" +tensile_update_library = "Tensile.TensileUpdateLibrary:main" \ No newline at end of file diff --git a/src/Tensile/GenerateSummations.py b/src/Tensile/GenerateSummations.py index bd255414f7..404300fb2d 100644 --- a/src/Tensile/GenerateSummations.py +++ b/src/Tensile/GenerateSummations.py @@ -29,6 +29,7 @@ import yaml import subprocess import glob +import sys from shutil import copyfile from copy import deepcopy @@ -70,7 +71,7 @@ def createLibraryForBenchmark(logicPath, libraryPath, currentPath): except (subprocess.CalledProcessError, OSError) as e: printExit("ClientWriter Benchmark Process exited with error: {}".format(e)) -def GenerateSummations(userArgs): +def main(userArgs = sys.argv[1:]): inputLogicPath = userArgs[0] 
outputPath = userArgs[1] diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py index dae1d0502e..e3bcdb4222 100644 --- a/src/Tensile/TensileCreateLibrary.py +++ b/src/Tensile/TensileCreateLibrary.py @@ -1023,7 +1023,7 @@ def WriteClientLibraryFromSolutions(solutionList, libraryWorkingPath, tensileSou ################################################################################ # Tensile Create Library ################################################################################ -def TensileCreateLibrary(): +def main(): print1("") print1(HR) print1("# Tensile Create Library") diff --git a/src/Tensile/bin/Tensile b/src/Tensile/bin/Tensile deleted file mode 100755 index 1c53682cdd..0000000000 --- a/src/Tensile/bin/Tensile +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -try: - from Tensile import Tensile -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import Tensile - -# script run from commandline -if __name__ == "__main__": - Tensile.main() diff --git a/src/Tensile/bin/TensileBenchmarkCluster b/src/Tensile/bin/TensileBenchmarkCluster deleted file mode 100755 index e1ac2592ec..0000000000 --- a/src/Tensile/bin/TensileBenchmarkCluster +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -try: - from Tensile import TensileBenchmarkCluster -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import TensileBenchmarkCluster - -# script run from commandline -if __name__ == "__main__": - TensileBenchmarkCluster.main() diff --git a/src/Tensile/bin/TensileClientConfig b/src/Tensile/bin/TensileClientConfig deleted file mode 100755 index 3c076ccb92..0000000000 --- a/src/Tensile/bin/TensileClientConfig +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -try: - from Tensile import TensileClientConfig -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import TensileClientConfig - -# script run from commandline -if __name__ == "__main__": - TensileClientConfig.main() diff --git a/src/Tensile/bin/TensileCreateLibrary b/src/Tensile/bin/TensileCreateLibrary deleted file mode 100755 index e90be28536..0000000000 --- a/src/Tensile/bin/TensileCreateLibrary +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -# This script only gets called by CMake - -try: - from Tensile.TensileCreateLibrary import TensileCreateLibrary -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile.TensileCreateLibrary import TensileCreateLibrary - -################################################################################ -# Main -################################################################################ -if __name__ == "__main__": - TensileCreateLibrary() diff --git a/src/Tensile/bin/TensileGenerateSummations b/src/Tensile/bin/TensileGenerateSummations deleted file mode 100755 index 3807c2b1dc..0000000000 --- a/src/Tensile/bin/TensileGenerateSummations +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -try: - from Tensile.GenerateSummations import GenerateSummations -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile.GenerateSummations import GenerateSummations - -################################################################################ -# Main -################################################################################ -if __name__ == "__main__": - GenerateSummations(sys.argv[1:]) diff --git a/src/Tensile/bin/TensileLibLogicToYaml b/src/Tensile/bin/TensileLibLogicToYaml deleted file mode 100755 index a16867cde0..0000000000 --- a/src/Tensile/bin/TensileLibLogicToYaml +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -try: - from Tensile import TensileLibLogicToYaml -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import TensileLibLogicToYaml - -# script run from commandline -if __name__ == "__main__": - TensileLibLogicToYaml.main() diff --git a/src/Tensile/bin/TensileMergeLibrary b/src/Tensile/bin/TensileMergeLibrary deleted file mode 100755 index a2bb92fa31..0000000000 --- a/src/Tensile/bin/TensileMergeLibrary +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright 2016-2022 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -# This script only gets called by CMake - -try: - from Tensile import TensileMergeLibrary -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import TensileMergeLibrary - -# script run from commandline -if __name__ == "__main__": - TensileMergeLibrary.main() diff --git a/src/Tensile/bin/TensileRetuneLibrary b/src/Tensile/bin/TensileRetuneLibrary deleted file mode 100755 index 21a8fca46c..0000000000 --- a/src/Tensile/bin/TensileRetuneLibrary +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -try: - from Tensile import TensileRetuneLibrary -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import TensileRetuneLibrary - -# script run from commandline -if __name__ == "__main__": - TensileRetuneLibrary.main() diff --git a/src/Tensile/bin/TensileUpdateLibrary b/src/Tensile/bin/TensileUpdateLibrary deleted file mode 100755 index 84a55de88a..0000000000 --- a/src/Tensile/bin/TensileUpdateLibrary +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -################################################################################ -# -# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -################################################################################ - -# This script only gets called by CMake - -try: - from Tensile import TensileUpdateLibrary -except ImportError: - import os.path - import sys - parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) - sys.path.append(parentdir) - - from Tensile import TensileUpdateLibrary - -# script run from commandline -if __name__ == "__main__": - TensileUpdateLibrary.main() From 5b3abbfd2e9335f301416d807d7853733f2058e5 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 13 Dec 2023 01:13:20 +0100 Subject: [PATCH 03/13] fix typo } used instead of ) --- src/Tensile/data/cmake/TensileConfigVersion.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake b/src/Tensile/data/cmake/TensileConfigVersion.cmake index 8c97bc461d..138865de22 100644 --- a/src/Tensile/data/cmake/TensileConfigVersion.cmake +++ b/src/Tensile/data/cmake/TensileConfigVersion.cmake @@ -36,7 +36,7 @@ else() set(PACKAGE_VERSION_EXACT FALSE) endif() -if(PACKAGE_VERSION_EXACT} OR (PACKAGE_FIND_VERSION VERSION_GREATER PACKAGE_VERSION)) +if(PACKAGE_VERSION_EXACT) OR (PACKAGE_FIND_VERSION VERSION_GREATER PACKAGE_VERSION)) set(PACKAGE_VERSION_COMPATIBLE TRUE) else() set(PACKAGE_VERSION_COMPATIBLE FALSE) From ece0fc03d5ed97efe97284aa5864cb706cbbad90 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 13 Dec 2023 01:15:39 +0100 Subject: [PATCH 04/13] Version is filled dynamically --- ...ileConfigVersion.cmake => TensileConfigVersion.cmake.j2} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename src/Tensile/data/cmake/{TensileConfigVersion.cmake => TensileConfigVersion.cmake.j2} (95%) diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 similarity index 95% rename from src/Tensile/data/cmake/TensileConfigVersion.cmake rename to 
src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 index 138865de22..d2267a7a33 100644 --- a/src/Tensile/data/cmake/TensileConfigVersion.cmake +++ b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 @@ -23,9 +23,9 @@ ################################################################################ # hardcoded tensile version; also in Tensile/__init__.py -set(TENSILE_VERSION_MAJOR 4) -set(TENSILE_VERSION_MINOR 40) -set(TENSILE_VERSION_PATCH 0) +set(TENSILE_VERSION_MAJOR {TENSILE_VERSION_MAJOR}) +set(TENSILE_VERSION_MINOR {TENSILE_VERSION_MINOR}) +set(TENSILE_VERSION_PATCH {TENSILE_VERSION_PATCH}) # export version set(PACKAGE_VERSION "${TENSILE_VERSION_MAJOR}.${TENSILE_VERSION_MINOR}.${TENSILE_VERSION_PATCH}") From 736244ee5b031345e26906fbee89921a03fc7c6a Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 13 Dec 2023 01:24:02 +0100 Subject: [PATCH 05/13] use f'string --- src/Tensile/BenchmarkProblems.py | 35 +++++++++++++++----------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/Tensile/BenchmarkProblems.py b/src/Tensile/BenchmarkProblems.py index 2fb094d609..35f896429e 100644 --- a/src/Tensile/BenchmarkProblems.py +++ b/src/Tensile/BenchmarkProblems.py @@ -83,7 +83,7 @@ def generateCustomKernelSolutions(problemType, customKernels, failOnMismatch): """Creates a list with a Solution object for each name in customKernel""" solutions = [] for kernelName in customKernels: - print1("# Processing custom kernel {}".format(kernelName)) + print1(f"# Processing custom kernel {kernelName}") solution = getCustomKernelSolutionObj(kernelName) if solution["ProblemType"] != problemType: # Raise error if this kernel was specifically requested and problem type doesn't match @@ -93,8 +93,8 @@ def generateCustomKernelSolutions(problemType, customKernels, failOnMismatch): customSet = set([(k,tuple(v)) if type(v) is list else (k,v) \ for k,v in solution["ProblemType"].items()]) - msg = "The problem type in the config file does not match " \ - "that 
of the custom kernel, {}.".format(kernelName) \ + msg = f"The problem type in the config file does not match " \ + f"that of the custom kernel, {kernelName}." \ + "\nDiffering parameters:\n" \ + "\tConfig values:\n\t" \ + str(sorted(benchmarkSet - (customSet & benchmarkSet))) \ @@ -102,9 +102,9 @@ def generateCustomKernelSolutions(problemType, customKernels, failOnMismatch): + str(sorted(customSet - (customSet & benchmarkSet))) printExit(msg) else: - print1("# Rejected {}: Problem Type doesn't match".format(kernelName)) + print1(f"# Rejected {kernelName}: Problem Type doesn't match") else: - print1("# Added {} to solutions".format(kernelName)) + print1(f"# Added {kernelName} to solutions") if solution["Valid"]: solutions.append(solution) elif globalParameters["PrintSolutionRejectionReason"]: @@ -214,14 +214,15 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG benchmarkProcess = BenchmarkProcess(problemTypeConfig, problemSizeGroupConfig) enableTileSelection = benchmarkProcess.problemType["TileAwareSelection"] - groupName = "{}_{:02d}".format(str(benchmarkProcess.problemType), problemSizeGroupIdx) + problemType = str(benchmarkProcess.problemType) + groupName = f"{problemType}_{problemSizeGroupIdx:02d}" pushWorkingPath(groupName) ensurePath(os.path.join(globalParameters["WorkingPath"], "Data")) totalBenchmarkSteps = len(benchmarkProcess) resultsFileBaseFinal = None - print1("# NumBenchmarkSteps: {}".format(totalBenchmarkSteps)) + print1(f"# NumBenchmarkSteps: {totalBenchmarkSteps}") print1("") print1(HR) print1("# Done Creating BenchmarkProcess Object") @@ -236,11 +237,11 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG print1(HR) currentTime = time.time() elapsedTime = currentTime - startTime - print1("# Benchmark Step: {} - {} {:.3f}s".format(groupName, stepName, elapsedTime)) - print1("# Num Sizes: {}".format(benchmarkStep.problemSizes.totalProblemSizes)) + print1(f"# Benchmark Step: {groupName} -
{stepName} {elapsedTime:.3f}s") + print1(f"# Num Sizes: {benchmarkStep.problemSizes.totalProblemSizes}") print1("# Fork Parameters:") for k, v in sorted(benchmarkStep.forkParams.items()): - print1("# {}: {}".format(k, v)) + print1(f"# {k}: {v}") pushWorkingPath(shortName) stepBaseDir = globalParameters["WorkingPath"] @@ -284,8 +285,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG maxPossibleSolutions += len(kcSolutions) solutions = regSolutions + kcSolutions - print1("# Actual Solutions: {} / {} after SolutionStructs\n" \ - .format(len(solutions), maxPossibleSolutions)) + print1(f"# Actual Solutions: {len(solutions)} / {maxPossibleSolutions} after SolutionStructs\n") # handle no valid solutions if len(solutions) == 0: @@ -300,7 +300,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG if globalParameters["PrintLevel"] >= 1: for solution in solutions: - print2("# ({}:{}) {}".format(0, 0, Solution.getNameFull(solution))) + print2(f"# (0:0) {Solution.getNameFull(solution)}") print2(HR) # write benchmarkFiles @@ -320,8 +320,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG } LibraryIO.writeYAML(cachePath, cacheData) - print1("# Actual Solutions: {} / {} after KernelWriter\n" \ - .format(len(solutions), prevCount )) + print1(f"# Actual Solutions: {len(solutions)} / {prevCount} after KernelWriter\n") else: solutions = None print1("# Using cached solution data") @@ -348,8 +347,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG if returncode: benchmarkTestFails += 1 - printWarning("BenchmarkProblems: Benchmark Process exited with code {}" \ - .format(returncode)) + printWarning(f"BenchmarkProblems: Benchmark Process exited with code {returncode}") else: print1("# Already benchmarked; skipping.") @@ -357,8 +355,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG popWorkingPath() # stepName currentTime =
time.time() elapsedTime = currentTime - startTime - print1("{}\n# {}\n# {}: End - {:.3f}s\n{}\n" \ - .format(HR, groupName, shortName, elapsedTime, HR)) + print1(f"{HR}\n# {groupName}\n# {shortName}: End - {elapsedTime:.3f}s\n{HR}\n") popWorkingPath() # ProblemType return (resultsFileBaseFinal, benchmarkTestFails) From 4d74bedd37dcafd85b2641797ce8f9cd53f480f3 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 13 Dec 2023 01:31:59 +0100 Subject: [PATCH 06/13] replace multiple call len(solutions) to num_of_solutions var --- src/Tensile/BenchmarkProblems.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Tensile/BenchmarkProblems.py b/src/Tensile/BenchmarkProblems.py index 35f896429e..04f6163c5c 100644 --- a/src/Tensile/BenchmarkProblems.py +++ b/src/Tensile/BenchmarkProblems.py @@ -284,11 +284,12 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG maxPossibleSolutions += len(kcSolutions) solutions = regSolutions + kcSolutions + num_of_solutions = len(solutions) - print1(f"# Actual Solutions: {len(solutions)} / {maxPossibleSolutions} after SolutionStructs\n") + print1(f"# Actual Solutions: {num_of_solutions} / {maxPossibleSolutions} after SolutionStructs\n") # handle no valid solutions - if len(solutions) == 0: + if num_of_solutions == 0: msg = "Your parameters resulted in 0 valid solutions." 
if globalParameters["PrintSolutionRejectionReason"]: msg += "\nExamine reject and backtrace messages above to see why" \ @@ -304,7 +305,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG print2(HR) # write benchmarkFiles - prevCount = len(solutions) + prevCount = num_of_solutions codeObjectFiles = writeBenchmarkFiles(stepBaseDir, solutions, \ benchmarkStep.problemSizes, shortName, []) # ^ this mutates solutions @@ -320,7 +321,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG } LibraryIO.writeYAML(cachePath, cacheData) - print1(f"# Actual Solutions: {len(solutions)} / {prevCount} after KernelWriter\n") + print1(f"# Actual Solutions: {len(solutions)} / {prevCount} after KernelWriter\n") else: solutions = None print1("# Using cached solution data") From 9101d9277dbd60732b32088cec6994fa6ac86a76 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 13 Dec 2023 01:33:23 +0100 Subject: [PATCH 07/13] use f'string --- src/Tensile/ClientExecutable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tensile/ClientExecutable.py b/src/Tensile/ClientExecutable.py index cde870b73e..b8349284a0 100644 --- a/src/Tensile/ClientExecutable.py +++ b/src/Tensile/ClientExecutable.py @@ -43,7 +43,7 @@ def generate(self): args = ['cmake'] args += ['-G', 'Ninja'] if (os.name == 'nt') else [] - args += itertools.chain.from_iterable([ ['-D{}={}'.format(key, value)] for key,value in self.options.items()]) + args += itertools.chain.from_iterable([ [f'-D{key}={value}'] for key,value in self.options.items()]) args += [self.sourceDir] args = [cmake_path(arg) for arg in args] From 66b35b15d0421c0a82d97c3d9e899631b41a1af4 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 13 Dec 2023 01:34:37 +0100 Subject: [PATCH 08/13] Error output got to stderr --- src/Tensile/ClientExecutable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Tensile/ClientExecutable.py
b/src/Tensile/ClientExecutable.py index b8349284a0..0e739b979f 100644 --- a/src/Tensile/ClientExecutable.py +++ b/src/Tensile/ClientExecutable.py @@ -25,6 +25,7 @@ import itertools import os import subprocess +from sys import stderr from . import Common from .Common import globalParameters, print2 @@ -54,7 +55,7 @@ def generate(self): out = subprocess.check_output(args, stderr=subprocess.STDOUT, cwd=Common.ensurePath(self.buildDir)) print2(out) except subprocess.CalledProcessError as err: - print(err.output) + print(err.output, file=stderr) raise @@ -67,7 +68,7 @@ def build(self): out = subprocess.check_output(args, stderr=subprocess.STDOUT, cwd=self.buildDir) print2(out) except subprocess.CalledProcessError as err: - print(err.output) + print(err.output, file=stderr) raise def builtPath(self, path, *paths): From bc48adbdd737cadb25460c881b9538fd0862619e Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 10 Jan 2024 22:44:40 +0100 Subject: [PATCH 09/13] Fetch version from metadata to avoid hardcoded value --- src/Tensile/__init__.py | 4 ++-- src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Tensile/__init__.py b/src/Tensile/__init__.py index bcbaed7d9c..f6b58f4853 100644 --- a/src/Tensile/__init__.py +++ b/src/Tensile/__init__.py @@ -24,9 +24,9 @@ # Even though we don't support python 2, this is still packaged sometimes with python 2. 
from __future__ import print_function +from importlib import metadata -# hardcoded tensile version; also in Tensile/Source/TensileConfigVersion.cmake -__version__ = "4.40.0" +__version__ = metadata.version("Tensile") def PrintTensileRoot(): import os.path diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 index d2267a7a33..1c56f4a612 100644 --- a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 +++ b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 @@ -22,7 +22,6 @@ # ################################################################################ -# hardcoded tensile version; also in Tensile/__init__.py set(TENSILE_VERSION_MAJOR {TENSILE_VERSION_MAJOR}) set(TENSILE_VERSION_MINOR {TENSILE_VERSION_MINOR}) set(TENSILE_VERSION_PATCH {TENSILE_VERSION_PATCH}) From 62b10e4024ad4b287c63652bbe6956169265e803 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 10 Jan 2024 23:17:35 +0100 Subject: [PATCH 10/13] The variable CMAKE_SOURCE_DIR has the same value and should be preferred --- src/Tensile/data/Source/client/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tensile/data/Source/client/CMakeLists.txt b/src/Tensile/data/Source/client/CMakeLists.txt index a9ac80a919..cae52f54c1 100644 --- a/src/Tensile/data/Source/client/CMakeLists.txt +++ b/src/Tensile/data/Source/client/CMakeLists.txt @@ -60,7 +60,7 @@ find_package(Boost COMPONENTS program_options REQUIRED) if (NOT WIN32) find_package(ROCmSMI QUIET) if(NOT ROCmSMI_FOUND) - set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${Tensile_DIR}" "${Tensile_DIR}/../Source/cmake" "${CMAKE_HOME_DIRECTORY}/cmake") + set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${Tensile_DIR}" "${Tensile_DIR}/../Source/cmake" "${CMAKE_SOURCE_DIR}/cmake") find_package(ROCmSMI REQUIRED) endif() endif() From 2ba8577e5c25cefda8fdbf714c63a881e3048e62 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 10 Jan 2024 23:37:06 +0100 
Subject: [PATCH 11/13] Use comprehension list for readability --- src/Tensile/TensileCreateLibrary.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py index e3bcdb4222..cd20da9ee3 100644 --- a/src/Tensile/TensileCreateLibrary.py +++ b/src/Tensile/TensileCreateLibrary.py @@ -862,9 +862,7 @@ def writeCMake(outputPath, solutionFiles, kernelFiles, libraryStaticFiles, maste buildObjectFilePaths(cmakeSrcDir, solutionFiles, kernelFiles, [], [], [], masterLibraries) # Build full paths the static library files - staticFilePaths = [] - for staticFile in libraryStaticFiles: - staticFilePaths += [ os.path.join(cmakeSrcDir, staticFile) ] + staticFilePaths = [os.path.join(cmakeSrcDir, staticFile) for staticFile in libraryStaticFiles] # Proceed to generate cmake file generatedFile = open(os.path.join(os.path.normcase(outputPath), "Generated.cmake"), "w") From be0d3f0c9d28b2ddb7847192207b83e481fbe6b6 Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Wed, 10 Jan 2024 23:53:19 +0100 Subject: [PATCH 12/13] Static file instruction already done few line above from the function copyStaticFiles --- src/Tensile/TensileCreateLibrary.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py index cd20da9ee3..455435c85c 100644 --- a/src/Tensile/TensileCreateLibrary.py +++ b/src/Tensile/TensileCreateLibrary.py @@ -1216,10 +1216,7 @@ def splitExtraParameters(par): if not arguments["GenerateSourcesAndExit"]: writeCMake(outputPath, solutionFiles, sourceKernelFiles, staticFiles, masterLibraries) - # Make sure to copy the library static files.
- for fileName in staticFiles: - shutil.copy( os.path.join(globalParameters["SourcePath"], fileName), \ - outputPath ) + # write solutions and kernels codeObjectFiles = writeSolutionsAndKernels(outputPath, CxxCompiler, None, solutions, From df083f6539108c34b5f4eeaf24afe07f4c12e89a Mon Sep 17 00:00:00 2001 From: Jonathan MERCIER Date: Thu, 18 Jan 2024 14:21:14 +0100 Subject: [PATCH 13/13] WIP refactoring by using importlib --- pyproject.toml | 5 + src/Tensile/ClientWriter.py | 2 +- src/Tensile/Common.py | 20 +- src/Tensile/CustomKernels.py | 26 +- ...128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s | 10827 ---------------- src/Tensile/KernelWriter.py | 2 +- src/Tensile/TensileCreateLibrary.py | 9 +- src/Tensile/data/Source/client/CMakeLists.txt | 2 +- .../cmake/TensileConfigVersion.cmake.j2 | 2 +- 9 files changed, 49 insertions(+), 10846 deletions(-) delete mode 100644 src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s rename src/Tensile/data/{ => template}/cmake/TensileConfigVersion.cmake.j2 (96%) diff --git a/pyproject.toml b/pyproject.toml index 9fa1466336..c1e4b64878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ build-backend = "poetry.core.masonry.api" name = "Tensile" version = "4.40.0" description = "Tensile is a tool for creating benchmark-driven backend libraries for GEMMs" +authors = ["Rocm team"] license = "MIT" classifiers = [ "Intended Audience :: Science/Research", @@ -15,6 +16,10 @@ packages = [ { include = "Tensile", from = "src" }, ] +include = [ + "src/Tensile/data/**/*", +] + [tool.poetry.dependencies] python = ">=3.8" toml = ">=0.10" diff --git a/src/Tensile/ClientWriter.py b/src/Tensile/ClientWriter.py index 49f8a24612..bea7c7fb02 100644 --- a/src/Tensile/ClientWriter.py +++ b/src/Tensile/ClientWriter.py @@ -176,7 +176,7 @@ def runClient(libraryLogicPath, forBenchmark, enableTileSelection, configPaths=N def getBuildClientLibraryScript(buildPath, libraryLogicPath): callCreateLibraryCmd = 
["python"] if os.name == "nt" else [] - callCreateLibraryCmd += [os.path.join(globalParameters["ScriptPath"] , "bin", "TensileCreateLibrary")] + callCreateLibraryCmd += ['tensile_create_library'] if globalParameters["MergeFiles"]: callCreateLibraryCmd += ["--merge-files"] diff --git a/src/Tensile/Common.py b/src/Tensile/Common.py index caf361fcb4..3e0fb7937c 100644 --- a/src/Tensile/Common.py +++ b/src/Tensile/Common.py @@ -34,6 +34,9 @@ import sys import time +import shutil +from importlib.resources import path, contents, files + startTime = time.time() ParallelMap = Parallel.ParallelMap @@ -225,7 +228,7 @@ globalParameters["AssemblerPath"] = None # /opt/rocm/hip/bin/hipcc globalParameters["WorkingPath"] = os.getcwd() # path where tensile called from globalParameters["IndexChars"] = "IJKLMNOPQRSTUVWXYZ" # which characters to use for C[ij]=Sum[k] A[ik]*B[jk] -globalParameters["ScriptPath"] = os.path.dirname(os.path.realpath(__file__)) # path to Tensile/Tensile.py +# FIXME source is now package with importlib globalParameters["SourcePath"] = os.path.join(globalParameters["ScriptPath"], "Source") # path to Tensile/Source/ globalParameters["HipClangVersion"] = "0.0.0" @@ -256,7 +259,7 @@ globalParameters["GranularityThreshold"] = 0.0 # directory where custom kernels are located -globalParameters["CustomKernelDirectory"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "CustomKernels") +globalParameters["CustomKernelDirectory"] = files('Tensile.data').joinpath("CustomKernels") globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile training verification or not @@ -2517,3 +2520,16 @@ def __del__(self): """ HR = "################################################################################" + +def copy_data_files(data_to_copy: List[str], destination_path: str) -> None: + if not os.path.exists(destination_path): + os.makedirs(destination_path) + + for resource in contents('Tensile.data'): + for data in data_to_copy: + if 
resource.startswith(data): + with path('Tensile.data', resource) as resource_path: + if os.path.isfile(resource_path): + shutil.copy(resource_path, destination_path) + elif os.path.isdir(resource_path): + shutil.copytree(resource_path, os.path.join(destination_path, resource)) diff --git a/src/Tensile/CustomKernels.py b/src/Tensile/CustomKernels.py index f3254b55f4..9c6a3907c5 100644 --- a/src/Tensile/CustomKernels.py +++ b/src/Tensile/CustomKernels.py @@ -27,24 +27,34 @@ import yaml import os +from pathlib import Path +from typing import Union, List def isCustomKernelConfig(config): return "CustomKernelName" in config and config["CustomKernelName"] -def getCustomKernelFilepath(name, directory=globalParameters["CustomKernelDirectory"]): - return os.path.join(directory, (name + ".s")) +def getCustomKernelFilepath(name, directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]): + if not isinstance(directory, Path): + directory = Path(directory) + return directory.joinpath(name + ".s") -def getAllCustomKernelNames(directory=globalParameters["CustomKernelDirectory"]): - return [fname[:-2] for fname in os.listdir(directory) if fname.endswith(".s")] +def getAllCustomKernelNames(directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]): + if not isinstance(directory, Path): + directory = Path(directory) + return [fname.name[:-2] for fname in directory.iterdir() if fname.name.endswith(".s")] -def getCustomKernelContents(name, directory=globalParameters["CustomKernelDirectory"]): +def getCustomKernelContents(name, directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]): + if not isinstance(directory, Path): + directory = Path(directory) try: with open(getCustomKernelFilepath(name, directory)) as f: return f.read() except: raise RuntimeError("Failed to find custom kernel: {}".format(os.path.join(directory, name))) -def getCustomKernelConfigAndAssembly(name, directory=globalParameters["CustomKernelDirectory"]): +def
getCustomKernelConfigAndAssembly(name, directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]): + if not isinstance(directory, Path): + directory = Path(directory) contents = getCustomKernelContents(name, directory) config = "\n" #Yaml configuration properties assembly = "" @@ -57,7 +67,9 @@ def getCustomKernelConfigAndAssembly(name, directory=globalParameters["CustomKer return (config, assembly) -def getCustomKernelConfig(name, directory=globalParameters["CustomKernelDirectory"]): +def getCustomKernelConfig(name,directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]): + if not isinstance(directory, Path): + directory = Path(directory) rawConfig, _ = getCustomKernelConfigAndAssembly(name, directory) try: return yaml.safe_load(rawConfig)["custom.config"] diff --git a/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s b/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s deleted file mode 100644 index 6199997f34..0000000000 --- a/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s +++ /dev/null @@ -1,10827 +0,0 @@ -/***********************************************************************************/ -/* */ -/* Copyright (C) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. */ -/* */ -/* Permission is hereby granted, free of charge, to any person obtaining a copy */ -/* of this software and associated documentation files (the "Software"), to deal */ -/* in the Software without restriction, including without limitation the rights */ -/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ -/* copies of the Software, and to permit persons to whom the Software is */ -/* furnished to do so, subject to the following conditions: */ -/* */ -/* The above copyright notice and this permission notice shall be included in */ -/* all copies or substantial portions of the Software. 
*/ -/* */ -/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ -/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ -/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ -/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ -/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ -/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ -/* SOFTWARE. */ -/* */ -/***********************************************************************************/ - - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -// Component.Signature.SignatureCOV3 -.amdgcn_target "amdgcn-amd-amdhsa--gfx90a" -.text -.protected DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4 -.globl DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4 -.p2align 8 -.type DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_accum_offset 256 // accvgpr offset - .amdhsa_next_free_vgpr 256 // vgprs - .amdhsa_next_free_sgpr 73 // sgprs - .amdhsa_group_segment_fixed_size 32768 // lds bytes - .amdhsa_private_segment_fixed_size 0 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 8 x 8 */ -/* SubGroup= 16 x 16 */ -/* VectorWidth=2 */ -/* GlobalLoadVectorWidthA=2, GlobalLoadVectorWidthB=2 */ 
-/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=False */ -.amdgpu_metadata ---- -custom.config: - ProblemType: - OperationType: GEMM - DataType: D - TransposeA: False - TransposeB: False - UseBeta: True - Batched: True - MatrixInstruction: [ 16, 16, 4, 1 ] - ThreadTile: [ 2, 128 ] - WorkGroup: [ 64, 4, 1 ] - DepthU: 16 - VectorWidth: 2 - SourceSwap: 1 - GlobalReadVectorWidth: 2 - StaggerUStride: 128 - StaggerU: 4 - WorkGroupMapping: 4 - AssertSizeMultiple: {3: 32} -amdhsa.version: - - 1 - - 1 -amdhsa.kernels: - - .name: DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4 - .symbol: 'DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f64 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f64 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f64 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - .value_type: f64 - .address_space: generic - - .name: alpha - .size: 8 - .offset: 56 - .value_kind: by_value - .value_type: f64 - - .name: beta - .size: 8 - .offset: 64 - .value_kind: by_value - .value_type: f64 - - .name: strideD0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: 
strideA0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 32768 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 256 - .private_segment_fixed_size: 0 - .sgpr_count: 73 - .sgpr_spill_count: 0 - .vgpr_count: 256 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... 
-.end_amdgpu_metadata -DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst:req, cc:req, src0:req, src1:req, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst:req, src0:req, src1:req, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_add_i32 dst:req, src0:req, src1:req, dpp= - v_add_i32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst:req, ccOut:req, src0:req, ccIn:req, src1:req, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst:req, cc:req, src0:req, src1:req, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst:req, src0:req, src1:req, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_i32 dst:req, src0:req, src1:req, dpp= - v_sub_i32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst:req, src0:req, src1:req, shiftCnt:req - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst:req, src0:req, src1:req, shiftCnt:req - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_or_b32 dst:req, src0:req, shiftCnt:req, src1:req - v_lshl_or_b32 \dst, \src0, \shiftCnt, \src1 -.endm - -.macro _v_cmpx_lt_i16 dst, src0, src1= - v_cmpx_lt_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lt_i32 dst, src0, src1= - v_cmpx_lt_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lt_i64 dst, src0, src1= - v_cmpx_lt_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lt_u16 dst, src0, src1= - v_cmpx_lt_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lt_u32 dst, src0, src1= - v_cmpx_lt_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lt_u64 dst, src0, src1= - v_cmpx_lt_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_eq_i16 dst, src0, src1= - v_cmpx_eq_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_eq_i32 dst, src0, src1= - v_cmpx_eq_i32 \dst, 
\src0, \src1 -.endm - -.macro _v_cmpx_eq_i64 dst, src0, src1= - v_cmpx_eq_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_eq_u16 dst, src0, src1= - v_cmpx_eq_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_eq_u32 dst, src0, src1= - v_cmpx_eq_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_eq_u64 dst, src0, src1= - v_cmpx_eq_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_le_i16 dst, src0, src1= - v_cmpx_le_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_le_i32 dst, src0, src1= - v_cmpx_le_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_le_i64 dst, src0, src1= - v_cmpx_le_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_le_u16 dst, src0, src1= - v_cmpx_le_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_le_u32 dst, src0, src1= - v_cmpx_le_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_le_u64 dst, src0, src1= - v_cmpx_le_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_gt_i16 dst, src0, src1= - v_cmpx_gt_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_gt_i32 dst, src0, src1= - v_cmpx_gt_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_gt_i64 dst, src0, src1= - v_cmpx_gt_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_gt_u16 dst, src0, src1= - v_cmpx_gt_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_gt_u32 dst, src0, src1= - v_cmpx_gt_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_gt_u64 dst, src0, src1= - v_cmpx_gt_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ne_i16 dst, src0, src1= - v_cmpx_ne_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ne_i32 dst, src0, src1= - v_cmpx_ne_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ne_i64 dst, src0, src1= - v_cmpx_ne_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ne_u16 dst, src0, src1= - v_cmpx_ne_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ne_u32 dst, src0, src1= - v_cmpx_ne_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ne_u64 dst, src0, src1= - v_cmpx_ne_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lg_i16 dst, src0, src1= - v_cmpx_lg_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lg_i32 dst, src0, src1= - 
v_cmpx_lg_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lg_i64 dst, src0, src1= - v_cmpx_lg_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lg_u16 dst, src0, src1= - v_cmpx_lg_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lg_u32 dst, src0, src1= - v_cmpx_lg_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_lg_u64 dst, src0, src1= - v_cmpx_lg_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ge_i16 dst, src0, src1= - v_cmpx_ge_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ge_i32 dst, src0, src1= - v_cmpx_ge_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ge_i64 dst, src0, src1= - v_cmpx_ge_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ge_u16 dst, src0, src1= - v_cmpx_ge_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ge_u32 dst, src0, src1= - v_cmpx_ge_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_ge_u64 dst, src0, src1= - v_cmpx_ge_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_o_i16 dst, src0, src1= - v_cmpx_o_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_o_i32 dst, src0, src1= - v_cmpx_o_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_o_i64 dst, src0, src1= - v_cmpx_o_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_o_u16 dst, src0, src1= - v_cmpx_o_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_o_u32 dst, src0, src1= - v_cmpx_o_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_o_u64 dst, src0, src1= - v_cmpx_o_u64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_u_i16 dst, src0, src1= - v_cmpx_u_i16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_u_i32 dst, src0, src1= - v_cmpx_u_i32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_u_i64 dst, src0, src1= - v_cmpx_u_i64 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_u_u16 dst, src0, src1= - v_cmpx_u_u16 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_u_u32 dst, src0, src1= - v_cmpx_u_u32 \dst, \src0, \src1 -.endm - -.macro _v_cmpx_u_u64 dst, src0, src1= - v_cmpx_u_u64 \dst, \src0, \src1 -.endm -.macro _v_mac_f32 c:req, a:req, b:req - v_mac_f32 \c, \a, \b -.endmacro - -/******************************************/ -/* Magic div 
and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicA - _v_add_u32 v[\dstIdx+0], v[\dstIdx+0], v[\dstIdx+1] - v_lshrrev_b32 v[\dstIdx+0], \magicShift, v[\dstIdx+0] -.endm - -/******************************************/ -/* VGPR Assignments */ -/******************************************/ -/* ValuC range: [0-128), serializedStore enabled */ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx */ -.set vgprValuA_X0_I0, 128 -.set vgprValuA_X1_I0, 132 -.set vgprValuA_X2_I0, 136 -.set vgprValuA_X3_I0, 140 -.set vgprValuB_X0_I0, 144 -.set vgprValuB_X1_I0, 160 -.set vgprValuB_X2_I0, 176 -.set vgprValuB_X3_I0, 192 -.set vgprLocalWriteAddrA, 208 -.set vgprLocalWriteAddrB, 209 -.set vgprGlobalReadOffsetA, 210 -.set vgprGlobalReadOffsetB, 214 -.set vgprG2LA, 218 -.set vgprValuA_X0_I1, 218 -.set vgprValuA_X1_I1, 222 -.set vgprValuA_X2_I1, 226 -.set vgprValuA_X3_I1, 230 -.set vgprG2LB, 234 -.set vgprLocalReadAddrA, 250 -.set vgprLocalReadAddrB, 251 -.set vgprSerial, 252 -/* Num VGPR=256 */ -/* Num AccVGPR=0 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprLoopCounterL, 5 -.set sgprOrigLoopCounter, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeA, 24 -.set sgprTensor2dSizeB, 26 -.set sgprAddressD, 28 -.set sgprAddressC, 30 -.set sgprAddressA, 32 -.set sgprAddressB, 34 -/* offsets pre-applied */ -.set sgprAlpha, 44 -.set sgprBeta, 46 -.set sgprStridesD, 48 -.set sgprStridesC, 50 -.set sgprStridesA, 52 -.set sgprStridesB, 54 -.set sgprSizesFree, 56 -.set sgprSizesSum, 59 -.set sgprOrigStaggerUIter, 60 -.set sgprNumWorkGroups0, 61 -.set sgprNumWorkGroups1, 
62 -.set sgprNumFullBlocks, 63 -.set sgprWgmRemainder1, 64 -.set sgprMagicNumberWgmRemainder1, 65 -.set sgprShadowLimitA, 36 -.set sgprShadowLimitB, 38 -.set sgprStaggerUIter, 7 -.set sgprWrapUA, 40 -.set sgprWrapUB, 42 -.set sgprGlobalReadIncsA, 66 -.set sgprGlobalReadIncsB, 67 -/* max SGPR=73 */ - -/* Size Assignments */ -.set sgprSizeI, sgprSizesFree+0 -.set sgprSizeJ, sgprSizesFree+1 -.set sgprSizeK, sgprSizesFree+2 -.set sgprSizeL, sgprSizesSum+0 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideA0I, 1 -.set sgprStrideAL, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set MT0, 128 -.set MT1, 128 -.set DepthU, 16 -.set GSU, 1 -.set BpeA, 8 -.set BpeALog2, 3 -.set BpeB, 8 -.set BpeBLog2, 3 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 2 -.set SrdShiftLeftB, 2 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0xffffffff -.set BufferOOB, 0x80000000 - -/******************************************/ -/* Bits 127:96 of SRD. 
*/ -/* hex: 0x00020000 */ -/* dst_sel_x (3b): 0 */ -/* dst_sel_y (3b): 0 */ -/* dst_sel_z (3b): 0 */ -/* dst_sel_w (3b): 0 */ -/* num_format (3b): 0 */ -/* data_format (4b): 4 */ -/* user_vm_enable (1b): 0 */ -/* user_vm_mode (1b): 0 */ -/* index_stride (2b): 0 */ -/* add_tid_enable (1b): 0 */ -/* _unusedA (3b): 0 */ -/* nv (1b): 0 */ -/* _unusedB (2b): 0 */ -/* type (2b): 0 */ -/******************************************/ -.set Srd127_96, 0x00020000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr:req vgprOffset0I:req vgprOffsetL:req vgprTmp:req -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideAL], v[\vgprOffsetL] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffset0I], v[\vgprTmp+0] // accumulate K lower -_v_add_u32 v[\vgprAddr+0], 0x2, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x3, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr:req vgprOffsetL:req vgprOffset1J:req vgprTmp:req -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate K lower -_v_add_u32 v[\vgprAddr+0], 0x2, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x3, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] // -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 
s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - - - -/******************************************/ -/* Allocate Resources */ -/******************************************/ - -s_setprio 3 // optimization store -s_mov_b32 m0, 0x9000 // LDS clamp at 36864 bytes -v_mov_b32 v[vgprSerial], v0 // thread serial id - -/* Load Kernel Args */ -s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 // -s_load_dwordx16 s[48:63], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x48 // -s_load_dwordx2 s[64:65], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x88 // -s_waitcnt lgkmcnt(0) // wait for 160 bytes of kern args -s_mov_b32 s44, s36 -s_mov_b32 s45, s37 -s_mov_b32 s46, s38 -s_mov_b32 s47, s39 - -/* Short circuit condition if Alpha == 0, then sumDims=0 */ -v_cmp_eq_f64 vcc, 
s[sgprAlpha:sgprAlpha+1], 0.0 // Alpha == 0.0 ? -s_cbranch_vccz label_AlphaNonZero // branch if Alpha != 0 -s_mov_b32 s[sgprSizesSum+0], 0x0 // Set summation dim=0 if Alpha == 0 -label_AlphaNonZero: - - -/******************************************/ -/* Local Read Addresses */ -/******************************************/ - - -/* local read addresses: tile assignments a/b */ - -/*lr0I*/ -v_and_b32 v2, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) -v_and_b32 v1, 15, v2 // 1. N offset: nIdx = wtid % MI_N(16) - // 1. N offset: nOffset = nIdx * nStride(1) (multiplier is 1, do nothing) -v_lshrrev_b32 v0, 4, v2 // 2. block offset: bnIdx = wtid / dividedForBlkId(16) -v_and_b32 v0, 0, v0 // 2. block offset: bnIdx = bnIdx % num1DBlocks(1) -v_lshlrev_b32 v0, 0x4, v0 // 2. block offset: bnOffset = bnIdx * strideBlock(16) -_v_add_u32 v1, v0, v1 // 3. add N and block offset: bnOffset = block and N offset -v_lshlrev_b32 v1, 0x1, v1 // 3. apply VectorWidth: bnOffset = bnOffset * vw(2) -v_lshrrev_b32 v2, 4, v2 // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) -v_lshlrev_b32 v2, 0x8, v2 // 4. K offset: lrKOffset = kIdx * mStride(256) -_v_add_u32 v1, v2, v1 // 5. offset in wave: lrOffset = bnOffset + lrKOffset -v_lshrrev_b32 v0, 6, v[vgprSerial] // 6. wave offset in N dimen: wtid = tid / dividedForWaveId(64) -v_and_b32 v0, 3, v0 // 6. wave offset in M dimen: wtid0 = wtid / num1DWaves(4) -v_lshlrev_b32 v0, 0x5, v0 // 6. wave offset in M dimen: wOffset = wtid0 * W0Stride(32) -_v_add_u32 v1, v0, v1 // 7. final local read offset: flrOffset = lrOffset + WOffset -/*lr1J*/ -v_and_b32 v3, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) -v_and_b32 v2, 15, v3 // 1. N offset: nIdx = wtid % MI_N(16) -v_lshlrev_b32 v2, 0x4, v2 // 1. N offset: nOffset = nIdx * nStride(16) -v_lshrrev_b32 v0, 4, v3 // 2. block offset: bnIdx = wtid / dividedForBlkId(16) -v_and_b32 v0, 0, v0 // 2. 
block offset: bnIdx = bnIdx % num1DBlocks(1) -v_lshlrev_b32 v0, 0x8, v0 // 2. block offset: bnOffset = bnIdx * strideBlock(256) -_v_add_u32 v2, v0, v2 // 3. add N and block offset: bnOffset = block and N offset - // 3. apply VectorWidth: bnOffset = bnOffset * vw(1) (multiplier is 1, do nothing) -v_lshrrev_b32 v3, 4, v3 // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) -v_lshlrev_b32 v3, 0x1, v3 // 4. K offset: lrKOffset = kIdx * mStride(2) -_v_add_u32 v2, v3, v2 // 5. offset in wave: lrOffset = bnOffset + lrKOffset - - -/* local read addresses: final offsets a */ - -// v_lshrrev_b32 v0, 8, v[vgprSerial] // LSU offset: sgid = Serial / subGroup(256) -// s_mov_b32 s68, 128 // LSU offset: stirde = MT0(128) + PAD0(0) -// v_mul_lo_u32 v0, s68, v0 // LSU offset: lsuoffset = sgid*(MT0+PAD) -// _v_add_lshl_u32 v[vgprLocalReadAddrA], v0, v1, 0x3 // Final Offset: offset = (lro0*VW+lsuoffset)*bpe - - -/* local read addresses: final offsets b */ - -v_lshrrev_b32 v0, 8, v[vgprSerial] // LSU offset: sgid = Serial / subGroup(256) -s_mov_b32 s68, 128 // LSU offset: stirde = MT1(128) + PAD1(0) -v_mul_lo_u32 v0, s68, v0 // LSU offset: lsuoffset = sgid*(MT1+PAD) -_v_add_lshl_u32 v[vgprLocalReadAddrB], v0, v2, 0x3 // Final Offset: offset = (lro1*VW+lsuoffset)*bpe -v_lshrrev_b32 v1, 7, v[vgprLocalReadAddrB] // Final Offset: padding 4 per block 128 -v_lshlrev_b32 v1, 0x5, v1 // Final Offset: padding 4 per block 128 -_v_add_u32 v[vgprLocalReadAddrB], v1, v[vgprLocalReadAddrB] // Final Offset: add padding 4 per block 128 - - -/* local read addresses: declare addresses a */ - -/* N/A */ - - -/* local read addresses: declare addresses b */ - -// _v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x4000, v[vgprLocalReadAddrB+0] // += LdsOffsetB (lower) - - - -/******************************************/ -/* Begin setupNewTile, isPap=False */ -/******************************************/ - - -/* global read addresses: work-group */ - -/* graWorkGroup mapping */ -s_mov_b32 s71, 0x20000001L // magic 
number for WGM==4 -s_mul_hi_u32 s69, s[sgprWorkGroup1], s71 // s_magic mul -s_mul_i32 s68, s[sgprWorkGroup1], s71 // s_magic mul -s_lshr_b64 s[68:69], s[68:69], 31 // sMagicDiv -s_mul_i32 s69, s68, 4 // quotient * non-magic divisor -s_sub_u32 s69, s[sgprWorkGroup1], s69 // WorkGroup1=remainder -s_mul_i32 s69, s69, s[sgprNumWorkGroups0] // (wg1 % WGM)*nwg0 -s_add_u32 s69, s69, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*nwg0 -s_cmp_ge_u32 s68, s[sgprNumFullBlocks] // blockId >= numFullBlocks ? -s_cmov_b32 s71, s[sgprMagicNumberWgmRemainder1] // -s_cselect_b32 s70, s[sgprWgmRemainder1], 4 // -s_mul_hi_u32 s3, s69, s71 // s_magic mul -s_mul_i32 s2, s69, s71 // s_magic mul -s_lshr_b64 s[2:3], s[2:3], 31 // sMagicDiv -s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s70 // quotient * non-magic divisor -s_sub_u32 s[sgprWorkGroup1], s69, s[sgprWorkGroup1] // WorkGroup1=remainder -s_mul_i32 s68, s68, 4 // blockId * WGM -s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s68 // wg1 += blockId * WGM - - -/* global read addresses: tile offset assignment a */ - -/* LVCA = 64 */ -/* v0 = (local)groA-tile = serial%LVCA (note (wgA*MTA) will be added to SRD) */ -/* v1 = groA-unroll = serial/LVCA */ -v_and_b32 v0, 15, v[vgprSerial] // v0 = v[vgprSerial] % 16 -v_lshlrev_b32 v0, 0x1, v0 // v0 = v0 * 2 -v_lshrrev_b32 v1, 6, v[vgprSerial] // v1 = v[vgprSerial] / 64 -v_lshlrev_b32 v1, 5, v1 // v1 = v1 * 32 -v_add_u32 v0, v0, v1 // v0 = v0 + v1 - -v_and_b32 v1, 63, v[vgprSerial] // v1 = (v[vgprSerial] % 64) / 16 -v_lshrrev_b32 v1, 4, v1 // v1 = (v[vgprSerial] % 64) / 16 -v_lshlrev_b32 v1, 1, v1 // v1 = v1 * 2 - -// v_lshrrev_b32 v1, 6, v[vgprSerial] // v1 = v[vgprSerial] / 64 -// v_and_b32 v0, 63, v[vgprSerial] // v0 = v[vgprSerial] % 64 -// /* gro-tile *= glvw */ -// v_lshlrev_b32 v0, 0x1, v0 // v0 = v0 * 2 - - -/* global read addresses: tile offset assignment b */ - -/* LVCB = 8 */ -/* v2 = (local)groB-tile = serial/LVCB (note (wgB*MTB) will be added to SRD) */ -/* v3 = groB-unroll = 
serial%LVCB */ -v_and_b32 v6, 63, v[vgprSerial] // v6 = v[vgprSerial] % 64 -v_lshrrev_b32 v2, 3, v6 // v2 = v6 / 8 -v_and_b32 v3, 7, v6 // v3 = v6 % 8 -v_readfirstlane_b32 s68, v[vgprSerial] // WaveIdxWavefrontWidth -s_lshr_b32 s68, s68, 0x6 // WaveId -s_mul_i32 s68, s68, 32 // Global Read Wave: each wave loads continuous lsp(8)*nrp(4) columns -_v_add_u32 v2, s68, v2 // Global Read Wave: add back to cloumn index -/* gro-unroll *= glvw */ -v_lshlrev_b32 v3, 0x1, v3 // v3 = v3 * 2 - - -/* global read addresses: unroll assignment a */ - -/* v1 */ - - -/* global read addresses: unroll assignment b */ - -/* v3 */ - - -/* global read addresses: other free assignments */ - -/* s[sgprWorkGroup2] */ - - -/* global read addresses: tile offsets a */ - -v_mov_b32 v4, v0 // groA0I_0 - - -/* global read addresses: tile offsets b */ - -v_mov_b32 v5, v2 // groB1J_0 -_v_add_co_u32 v6, vcc, 8, v5 // groB1J_1 += LSPB -_v_add_co_u32 v7, vcc, 8, v6 // groB1J_2 += LSPB -_v_add_co_u32 v8, vcc, 8, v7 // groB1J_3 += LSPB - - -/* global read addresses: unroll offsets a */ - -v_mov_b32 v9, v1 // groAL_0 -_v_add_co_u32 v10, vcc, 1, v9 // groAL_1 + LSPA -_v_add_co_u32 v11, vcc, 8, v9 // groAL_2 + LSPA -_v_add_co_u32 v12, vcc, 9, v9 // groAL_3 + LSPA - - -/* global read addresses: unroll offsets b */ - -v_mov_b32 v13, v3 // groBL_0 - - -/* global read addresses: shift a */ - -s_mul_i32 s68, s[sgprWorkGroup0], 128 // WorkGroup[01] * MT -s_sub_u32 s68, s[sgprSizeI], s68 // edge = Size0I - WG*MT -s_sub_u32 s68, s68, 2 // edge -= margin(2) -v_mov_b32 v14, s68 // edge vgpr = Size0I- WG*MT - margin(2) -_v_add_co_u32 v15, vcc, v14, 2 // shiftedEdge = edge + srdShiftLeft(2) -_v_add_co_u32 v16, vcc, v4, 2 // shiftedOffset = offset + srdShiftLeft(2) -v_cmp_lt_u32 s[68:69], v16, v15 // shiftedOffset < shiftedEdge -v_cndmask_b32 v4, v14, v4, s[68:69] // offset = (offset < edge) ? 
offset(v4) : edge(v14) - - -/* global read addresses: final offsets a */ - -GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0, 4, 9, 14 // gROA_0_0_0_0 -GLOBAL_OFFSET_A vgprGlobalReadOffsetA+1, 4, 10, 14 // gROA_0_0_1_0 -GLOBAL_OFFSET_A vgprGlobalReadOffsetA+2, 4, 11, 14 // gROA_0_0_2_0 -GLOBAL_OFFSET_A vgprGlobalReadOffsetA+3, 4, 12, 14 // gROA_0_0_3_0 - - -/* global read addresses: final offsets b */ - -GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0, 13, 5, 9 // gROB_0_0_0_0 -GLOBAL_OFFSET_B vgprGlobalReadOffsetB+1, 13, 6, 9 // gROB_0_0_1_0 -GLOBAL_OFFSET_B vgprGlobalReadOffsetB+2, 13, 7, 9 // gROB_0_0_2_0 -GLOBAL_OFFSET_B vgprGlobalReadOffsetB+3, 13, 8, 9 // gROB_0_0_3_0 - - -/* global read addresses: addresses a */ - -/* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s71, s[sgprWorkGroup0], 128 // WorkGroup[01] * MT -s_mul_i32 s70, s[sgprWorkGroup0], 128 // WorkGroup[01] * MT -s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s70 // sub tileStart -s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], s71 // sub tileStart -s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x3 // Set limit to use bytes -s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad -s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
-s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s69, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s68, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s70, s70, s68 // accum wg term to tilestart -s_addc_u32 s71, s71, s69 // accum wg term to tilestart -s_lshl_b64 s[70:71], s[70:71], 0x3 // tileStart *= BPE -s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s70 // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s71 // SRD base = Address+ tileStart1 -s_sub_u32 s[sgprSrdA+0], s[sgprSrdA+0], 16 // pre-pad to make room for possible pointer shift -s_subb_u32 s[sgprSrdA+1], s[sgprSrdA+1], 0 // pre-pad to make room for possible pointer shift -s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD - - -/* global read addresses: addresses b */ - -/* max read offset = size[n] * stride[n-1] */ -s_mul_hi_u32 s71, s[sgprWorkGroup1], 128 // WorkGroup[01] * MT -s_mul_i32 s70, s[sgprWorkGroup1], 128 // WorkGroup[01] * MT -s_mul_hi_u32 s71, s70, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_mul_i32 s70, s70, s[sgprStrideB1J] // tlu=0, scaled tile-offset by stride -s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s70 // sub tileStart -s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], s71 // sub tileStart -s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x3 // Set limit to use bytes -s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad -s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
-s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 -s_mul_hi_u32 s69, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_mul_i32 s68, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG -s_add_u32 s70, s70, s68 // accum wg term to tilestart -s_addc_u32 s71, s71, s69 // accum wg term to tilestart -s_lshl_b64 s[70:71], s[70:71], 0x3 // tileStart *= BPE -s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s70 // SRD base = Address+ tileStart0 -s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s71 // SRD base = Address+ tileStart1 -s_sub_u32 s[sgprSrdB+0], s[sgprSrdB+0], 16 // pre-pad to make room for possible pointer shift -s_subb_u32 s[sgprSrdB+1], s[sgprSrdB+1], 0 // pre-pad to make room for possible pointer shift -s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD - - -/* global read addresses: increments a */ - -s_mul_i32 s[sgprGlobalReadIncsA+0], DepthU*BpeA, s[sgprStrideAL] // incrA unrollIdx) - - -/* global read addresses: increments b */ - -s_mov_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeB // incrB (unrollIdx) - - -/******************************************/ -/* Local Write Addresses */ -/******************************************/ - -/* lwaTileAssignmentA = v0 */ - -/* lwaTileAssignmentB = v2 */ - -/* lwaUnrollAssignmentA = v1 */ - -/* lwaUnrollAssignmentB = v3 */ - - -/* local write addresses: first offset a */ - -// v_mul_u32_u24 v[vgprLocalWriteAddrA], 0x80, v1 // lwAL**(MTA + PAD) -// _v_add_lshl_u32 v[vgprLocalWriteAddrA], v0, v[vgprLocalWriteAddrA], 0x3 // lwFOA = (lwAA + lwAL*(MT0I+PAD))*bpe - - -/* local write addresses: first offset b */ - -v_mul_u32_u24 v[vgprLocalWriteAddrB], 0x10, v2 // lwBL**(DepthU_Compute + PAD) -_v_add_lshl_u32 v[vgprLocalWriteAddrB], v3, v[vgprLocalWriteAddrB], 0x3 // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpe -v_lshrrev_b32 v3, 7, v[vgprLocalWriteAddrB] // padding 4 per block 128 -v_lshlrev_b32 v3, 0x5, v3 // padding 4 per block 128 -_v_add_u32 v[vgprLocalWriteAddrB], v3, 
v[vgprLocalWriteAddrB] // add padding 4 per block 128 -// _v_add_co_u32 v[vgprLocalWriteAddrB], vcc, 0x4000, v[vgprLocalWriteAddrB] // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=2048*8 - - - - - - - -/* declare loop num iterations */ - - -s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 4 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 16 -s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter - -s_and_b32 s[sgprStaggerUIter], s[sgprOrigStaggerUIter], s[sgprWorkGroup0] // Compute actual stagger start for this tile -s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], 0 // shift by StaggerUStride - - -/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */ -s_mul_hi_i32 s69, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset -s_mul_i32 s68, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] // stagger byte offset -s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop -s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop -s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration -s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1] // remove one iteration -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0] // Move shadow to real if we are within 2^32 - - -/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */ -s_mul_hi_i32 s69, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset -s_mul_i32 s68, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] // stagger byte offset -s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop -s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop -s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration -s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1] // remove one iteration -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? -s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0] // Move shadow to real if we are within 2^32 -s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap - -/* local read addresses: init pointers a */ - - -/* localReadInitPointers */ - -/* local read addresses: init pointers b */ - - -/* localReadInitPointers */ - - -/* prefetch: global -> local */ - -s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? 
-// s_setprio 0 // optimization store -s_cbranch_scc1 ShadowInitStart_9 // skip to ShadowInitStart iter b/c numIter==0 - -buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0 - -buffer_load_dwordx4 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -buffer_load_dwordx4 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -buffer_load_dwordx4 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -buffer_load_dwordx4 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0 - - - - -/* global read inc A loopL */ -s_add_u32 s70, s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s70 // Is this wrapIter? (pf) -s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUA+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0] // Move shadow to real if we are within 2^32 - -/* global read inc B loopL */ -s_add_u32 s70, s[sgprLoopCounterL], 1 // remove pf(1) -s_cmp_eq_u32 s[sgprStaggerUIter], s70 // Is this wrapIter? (pf) -s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUB+1], 0 // incUpper <- ? -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? -s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0] // Move shadow to real if we are within 2^32 - - -/******************************************/ -/* End setupNewTile, isPap=False */ -/******************************************/ - -ShadowInitStart_9: // - -s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) -s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields -s_mov_b32 s[sgprSrdD+2], 0x80000000 // -s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD - -s_mov_b32 s[sgprSrdC+0], s[sgprAddressC+0] // init SRD base address (lower) -s_mov_b32 s[sgprSrdC+1], s[sgprAddressC+1] // init SRD base address (upper) + other fields -s_mov_b32 s[sgprSrdC+2], 0x80000000 // -s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD - - -s_mul_i32 s70, MT1, s[sgprWorkGroup1] // <- wg1*MT1 -s_mul_hi_u32 s69, s70, s[sgprStrideC1J] // CScale s70 by Stride -s_mul_i32 s68, s70, s[sgprStrideC1J] // CScale s70 by Stride -s_lshl_b64 s[68:69], s[68:69], 3 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s69 // add hi to SRD -s_mul_hi_u32 s69, s70, s[sgprStrideD1J] // Scale s70 by Stride -s_mul_i32 s68, 
s70, s[sgprStrideD1J] // Scale s70 by Stride -s_lshl_b64 s[68:69], s[68:69], 3 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s69 // add hi to SRD - -s_mul_hi_u32 s69, s[sgprWorkGroup2], s[sgprStrideCK] // CScale s[sgprWorkGroup2] by Stride -s_mul_i32 s68, s[sgprWorkGroup2], s[sgprStrideCK] // CScale s[sgprWorkGroup2] by Stride -s_lshl_b64 s[68:69], s[68:69], 3 // scale by bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68 // add lo to SRD -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s69 // add hi to SRD -s_mul_hi_u32 s69, s[sgprWorkGroup2], s[sgprStrideDK] // Scale s[sgprWorkGroup2] by Stride -s_mul_i32 s68, s[sgprWorkGroup2], s[sgprStrideDK] // Scale s[sgprWorkGroup2] by Stride -s_lshl_b64 s[68:69], s[68:69], 3 // scale by bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68 // add lo to SRD -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s69 // add hi to SRD - - - -/* initC: remove C-tile 0-128 from pool */ -v_mov_b32 v208, 15728640 // set out-of-bound addr -ds_read_b32 v[vgprValuC+0], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+1], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+2], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+3], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+4], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+5], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+6], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+7], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+8], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+9], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+10], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+11], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+12], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+13], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+14], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+15], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+16], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+17], v208, offset:0 // initC 
-ds_read_b32 v[vgprValuC+18], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+19], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+20], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+21], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+22], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+23], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+24], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+25], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+26], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+27], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+28], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+29], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+30], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+31], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+32], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+33], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+34], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+35], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+36], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+37], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+38], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+39], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+40], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+41], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+42], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+43], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+44], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+45], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+46], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+47], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+48], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+49], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+50], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+51], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+52], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+53], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+54], v208, offset:0 // initC 
-ds_read_b32 v[vgprValuC+55], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+56], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+57], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+58], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+59], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+60], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+61], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+62], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+63], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+64], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+65], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+66], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+67], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+68], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+69], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+70], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+71], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+72], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+73], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+74], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+75], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+76], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+77], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+78], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+79], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+80], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+81], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+82], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+83], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+84], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+85], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+86], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+87], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+88], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+89], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+90], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+91], v208, offset:0 // initC 
-ds_read_b32 v[vgprValuC+92], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+93], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+94], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+95], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+96], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+97], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+98], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+99], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+100], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+101], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+102], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+103], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+104], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+105], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+106], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+107], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+108], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+109], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+110], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+111], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+112], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+113], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+114], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+115], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+116], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+117], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+118], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+119], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+120], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+121], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+122], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+123], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+124], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+125], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+126], v208, offset:0 // initC -ds_read_b32 v[vgprValuC+127], v208, offset:0 // initC - -/* initC: remove AB-tile 
128-208 from pool */ - -s_cmp_eq_u32 s[sgprLoopCounterL], 0 // at last iteration? - -/* after InitC, skip to end of prefetch last iter if numIter==0 */ -s_cbranch_scc0 label_NoBranch_10 // Only branch on scc1 -s_getpc_B64 s[68:69] // addr of next instr -s_add_i32 s70, PrefetchGlobalLastIterEnd_4, 0x4 // target branch offset -s_cmp_ge_i32 s70, 0x0 // check positive or negative -s_cbranch_scc1 label_Positive_11 // jump when positive -s_abs_i32 s70, s70 // abs offset -s_sub_u32 s68, s68, s70 // sub target branch offset -s_subb_u32 s69, s69, 0 // sub high and carry -s_setpc_b64 s[68:69] // branch to PrefetchGlobalLastIterEnd_4 -label_Positive_11: -s_add_u32 s68, s68, s70 // add target branch offset -s_addc_u32 s69, s69, 0 // add high and carry -s_setpc_b64 s[68:69] // branch to PrefetchGlobalLastIterEnd_4 -label_NoBranch_10: - -s_waitcnt vmcnt(4) // lgkmcnt=-1 vmcnt=08wait for global read - -/* local write b */ -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0 -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280 -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560 -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840 - - -/* local write swap a */ - - - -/* local write swap b */ - - - - -s_cmp_eq_u32 s[sgprLoopCounterL] 0x1 // PGR=2 but only 1 loop -s_cbranch_scc1 label_0012 // PGR=2 but only 1 loop - - -buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], 
s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0 - -buffer_load_dwordx4 v[vgprValuA_X0_I1+0:vgprValuA_X0_I1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -buffer_load_dwordx4 v[vgprValuA_X0_I1+4:vgprValuA_X0_I1+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -buffer_load_dwordx4 v[vgprValuA_X0_I1+8:vgprValuA_X0_I1+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -buffer_load_dwordx4 v[vgprValuA_X0_I1+12:vgprValuA_X0_I1+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0 - -label_0012: // - -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-10prefetch wait for local write - -// Skip force waitcnt0 -s_barrier // - - -/* local read prefetch b */ - -ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 
vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 - - -/* local read inc a */ - -/* N/A, lro->128 */ -/* self.localReadDoCntA 1 self.localReadDoCntB 1 */ - - -/* local read inc b */ - -/* N/A, lro->8 */ -/* self.localReadDoCntA 1 self.localReadDoCntB 1 */ - - - -/******************************************/ -/* Unrolled Loop(s) - Begin */ -/******************************************/ - -openLoopL_13: -s_cmp_eq_u32 s[sgprLoopCounterL], 0x1 // LoopCounterL < EndCounter -s_cbranch_scc1 label_0014 // PGR=2 but only 1 loop, toPGR1 -s_cmp_le_u32 s[sgprLoopCounterL], 0x2 // LoopCounterL < EndCounter -s_cbranch_scc1 LoopEndL_2 // do not enter LoopL -LoopBeginL_1: - - -/******************************************/ -/* Unrolled Loop 1/2 - Begin */ -/******************************************/ - - -/* Begin Each Unroll: Check VGPR.checkin for INT8 LW */ - - - - - -/* iter 0 */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:0 */ -s_waitcnt lgkmcnt(0) vmcnt(8) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[0:7] -/* mfmaIndex:1 */ -ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -/* global read inc A loopL */ -s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? 
-s_cselect_b32 s69, s[sgprWrapUA+1], 0 // incUpper <- ? -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[8:15] -/* mfmaIndex:2 */ -ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc) -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[16:23] -/* mfmaIndex:3 */ -ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? -s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0] // Move shadow to real if we are within 2^32 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[24:31] -/* mfmaIndex:4 */ -s_setprio 0 // store optimization -ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -/* global read inc B loopL */ -s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUB+1], 0 // incUpper <- ? 
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[32:39] -/* mfmaIndex:5 */ -ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc) -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[40:47] -/* mfmaIndex:6 */ -ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0] // Move shadow to real if we are within 2^32 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[48:55] -/* mfmaIndex:7 */ -ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[56:63] -/* mfmaIndex:8 */ -ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[64:71] -/* mfmaIndex:9 */ -/* localReadsVacancy: letencyLeft 1 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[72:79] -/* mfmaIndex:10 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[80:87] -/* mfmaIndex:11 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[88:95] -/* mfmaIndex:12 */ -s_setprio 3 // store optimization -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[96:103] -/* mfmaIndex:13 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[104:111] -/* mfmaIndex:14 */ -/* 1 LDS buffer: read-sync-write */ -s_waitcnt lgkmcnt(0) // -s_barrier // -/* 
localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[112:119] -/* mfmaIndex:15 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0 -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[120:127] -buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -/* numPrefetchIter=0 */ -/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - -/* iter 1 */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:16 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(1) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[0:7] -/* mfmaIndex:17 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[8:15] -/* mfmaIndex:18 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[16:23] -/* mfmaIndex:19 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[24:31] -/* mfmaIndex:20 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], 
v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[32:39] -/* mfmaIndex:21 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[40:47] -/* mfmaIndex:22 */ -buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[48:55] -/* mfmaIndex:23 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[56:63] -/* mfmaIndex:24 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[64:71] -/* mfmaIndex:25 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[72:79] -/* mfmaIndex:26 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[80:87] -/* mfmaIndex:27 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560 -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[88:95] -/* mfmaIndex:28 */ -buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -v_mfma_f64_16x16x4f64 v[96:103], 
v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[96:103] -/* mfmaIndex:29 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[104:111] -/* mfmaIndex:30 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[112:119] -/* mfmaIndex:31 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 2 (reset local read pointers iteration) (swap local read pointers iteration) */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:32 */ -s_waitcnt lgkmcnt(3) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[0:7] -/* mfmaIndex:33 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840 -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[8:15] -/* mfmaIndex:34 */ -buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0 -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[16:23] -/* mfmaIndex:35 */ -v_mfma_f64_16x16x4f64 v[24:31], 
v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[24:31] -/* mfmaIndex:36 */ -s_setprio 0 // store optimization -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[32:39] -/* mfmaIndex:37 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[40:47] -/* mfmaIndex:38 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[48:55] -/* mfmaIndex:39 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[56:63] -/* mfmaIndex:40 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[64:71] -/* mfmaIndex:41 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[72:79] -/* mfmaIndex:42 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[80:87] -/* mfmaIndex:43 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[88:95] -/* mfmaIndex:44 */ -s_setprio 3 // store optimization -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-13wait for local write -s_barrier // -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[96:103] -/* mfmaIndex:45 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[104:111] -/* mfmaIndex:46 */ -v_mfma_f64_16x16x4f64 v[112:119], 
v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[112:119] -/* mfmaIndex:47 */ - -/* local read swap offsets a */ - -/* local read swap offsets b */ - -/* local read init pointers a */ - -/* localReadInitPointers */ - -/* local read init pointers b */ - -/* localReadInitPointers */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 3 (swap and reset local write pointers iteration) */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:48 */ - -/* local write swap offsets a */ - -/* local write swap offsets b */ -s_waitcnt lgkmcnt(4) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[0:7] -/* mfmaIndex:49 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[8:15] -/* mfmaIndex:50 */ -ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[16:23] -/* mfmaIndex:51 */ -ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[24:31] -/* 
mfmaIndex:52 */ -buffer_load_dwordx4 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[32:39] -/* mfmaIndex:53 */ -ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[40:47] -/* mfmaIndex:54 */ -ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[48:55] -/* mfmaIndex:55 */ -ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[56:63] -/* mfmaIndex:56 */ -buffer_load_dwordx4 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[64:71] -/* mfmaIndex:57 */ -ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[72:79] -/* 
mfmaIndex:58 */ -ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[80:87] -/* mfmaIndex:59 */ -ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[88:95] -/* mfmaIndex:60 */ -buffer_load_dwordx4 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[96:103] -/* mfmaIndex:61 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[104:111] -/* mfmaIndex:62 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[112:119] -/* mfmaIndex:63 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[120:127] -buffer_load_dwordx4 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0 -/* numPrefetchIter=1 */ -/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - - - -/******************************************/ -/* Unrolled Loop - End */ -/******************************************/ - -s_sub_u32 s[sgprLoopCounterL], 
s[sgprLoopCounterL], 1 // dec counterL - -/******************************************/ -/* Unrolled Loop 2/2 - Begin */ -/******************************************/ - - -/* Begin Each Unroll: Check VGPR.checkin for INT8 LW */ - - - - - -/* iter 0 */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:0 */ -s_waitcnt lgkmcnt(0) vmcnt(8) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[0:7] -/* mfmaIndex:1 */ -ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -/* global read inc A loopL */ -s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUA+1], 0 // incUpper <- ? 
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[8:15] -/* mfmaIndex:2 */ -ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc) -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[16:23] -/* mfmaIndex:3 */ -ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? -s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0] // Move shadow to real if we are within 2^32 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[24:31] -/* mfmaIndex:4 */ -s_setprio 0 // store optimization -ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -/* global read inc B loopL */ -s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUB+1], 0 // incUpper <- ? 
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[32:39] -/* mfmaIndex:5 */ -ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc) -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[40:47] -/* mfmaIndex:6 */ -ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0] // Move shadow to real if we are within 2^32 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[48:55] -/* mfmaIndex:7 */ -ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[56:63] -/* mfmaIndex:8 */ -ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[64:71] -/* mfmaIndex:9 */ -/* localReadsVacancy: letencyLeft 1 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[72:79] -/* mfmaIndex:10 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[80:87] -/* mfmaIndex:11 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[88:95] -/* mfmaIndex:12 */ -s_setprio 3 // store optimization -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[96:103] -/* mfmaIndex:13 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[104:111] -/* mfmaIndex:14 */ -/* 1 LDS buffer: read-sync-write */ -s_waitcnt lgkmcnt(0) // -s_barrier // -/* 
localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[112:119] -/* mfmaIndex:15 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0 -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[120:127] -buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -/* numPrefetchIter=0 */ -/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - -/* iter 1 */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:16 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(1) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[0:7] -/* mfmaIndex:17 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[8:15] -/* mfmaIndex:18 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[16:23] -/* mfmaIndex:19 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[24:31] -/* mfmaIndex:20 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], 
v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[32:39] -/* mfmaIndex:21 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[40:47] -/* mfmaIndex:22 */ -buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[48:55] -/* mfmaIndex:23 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[56:63] -/* mfmaIndex:24 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[64:71] -/* mfmaIndex:25 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[72:79] -/* mfmaIndex:26 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[80:87] -/* mfmaIndex:27 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560 -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[88:95] -/* mfmaIndex:28 */ -buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -v_mfma_f64_16x16x4f64 v[96:103], 
v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[96:103] -/* mfmaIndex:29 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[104:111] -/* mfmaIndex:30 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[112:119] -/* mfmaIndex:31 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 2 (reset local read pointers iteration) (swap local read pointers iteration) */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:32 */ -s_waitcnt lgkmcnt(3) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[0:7] -/* mfmaIndex:33 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=7wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840 -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[8:15] -/* mfmaIndex:34 */ -buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0 -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[16:23] -/* mfmaIndex:35 */ -v_mfma_f64_16x16x4f64 v[24:31], 
v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[24:31] -/* mfmaIndex:36 */ -s_setprio 0 // store optimization -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[32:39] -/* mfmaIndex:37 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[40:47] -/* mfmaIndex:38 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[48:55] -/* mfmaIndex:39 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[56:63] -/* mfmaIndex:40 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[64:71] -/* mfmaIndex:41 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[72:79] -/* mfmaIndex:42 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[80:87] -/* mfmaIndex:43 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[88:95] -/* mfmaIndex:44 */ -s_setprio 3 // store optimization -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-13wait for local write -s_barrier // -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[96:103] -/* mfmaIndex:45 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[104:111] -/* mfmaIndex:46 */ -v_mfma_f64_16x16x4f64 v[112:119], 
v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[112:119] -/* mfmaIndex:47 */ - -/* local read swap offsets a */ - -/* local read swap offsets b */ - -/* local read init pointers a */ - -/* localReadInitPointers */ - -/* local read init pointers b */ - -/* localReadInitPointers */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 3 (swap and reset local write pointers iteration) */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:48 */ - -/* local write swap offsets a */ - -/* local write swap offsets b */ -s_waitcnt lgkmcnt(4) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[0:7] -/* mfmaIndex:49 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[8:15] -/* mfmaIndex:50 */ -ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[16:23] -/* mfmaIndex:51 */ -ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[24:31] -/* 
mfmaIndex:52 */ -buffer_load_dwordx4 v[vgprValuA_X0_I1+0:vgprValuA_X0_I1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0 -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[32:39] -/* mfmaIndex:53 */ -ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[40:47] -/* mfmaIndex:54 */ -ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[48:55] -/* mfmaIndex:55 */ -ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[56:63] -/* mfmaIndex:56 */ -buffer_load_dwordx4 v[vgprValuA_X0_I1+4:vgprValuA_X0_I1+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[64:71] -/* mfmaIndex:57 */ -ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[72:79] -/* 
mfmaIndex:58 */ -ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[80:87] -/* mfmaIndex:59 */ -ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[88:95] -/* mfmaIndex:60 */ -buffer_load_dwordx4 v[vgprValuA_X0_I1+8:vgprValuA_X0_I1+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0 -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[96:103] -/* mfmaIndex:61 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[104:111] -/* mfmaIndex:62 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[112:119] -/* mfmaIndex:63 */ -s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL -s_cmp_eq_u32 s[sgprLoopCounterL], 0x2 // counterL==2 -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[120:127] -buffer_load_dwordx4 v[vgprValuA_X0_I1+12:vgprValuA_X0_I1+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0 -/* numPrefetchIter=1 */ -/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - - - -/******************************************/ 
-/* Unrolled Loop - End */ -/******************************************/ - - -/* closeLoop loopL finalLoop=1 tailLoop=0 */ -s_cbranch_scc0 LoopBeginL_1 // restart LoopL -LoopEndL_oddexit_3: // unroll loop odditer exit -LoopEndL_2: - - -/* Before NLL: Check VGPR.checkin for INT8 LW */ - - - - -/******************************************/ -/* NoGlobalLoadLoop - Begin */ -/******************************************/ - -s_setprio 0 // store optimization - -/* iter 0 */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:0 */ -s_waitcnt lgkmcnt(0) vmcnt(8) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[0:7] -/* mfmaIndex:1 */ -ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -/* global read inc A loopL */ -s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUA+1], 0 // incUpper <- ? 
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[8:15] -/* mfmaIndex:2 */ -ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc) -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[16:23] -/* mfmaIndex:3 */ -ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? -s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0] // Move shadow to real if we are within 2^32 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[24:31] -/* mfmaIndex:4 */ -ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -/* global read inc B loopL */ -s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? -s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ? -s_cselect_b32 s69, s[sgprWrapUB+1], 0 // incUpper <- ? 
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[32:39] -/* mfmaIndex:5 */ -ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc) -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[40:47] -/* mfmaIndex:6 */ -ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0] // Move shadow to real if we are within 2^32 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[48:55] -/* mfmaIndex:7 */ -ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[56:63] -/* mfmaIndex:8 */ -ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[64:71] -/* mfmaIndex:9 */ -/* localReadsVacancy: letencyLeft 1 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[72:79] -/* mfmaIndex:10 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[80:87] -/* mfmaIndex:11 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[88:95] -/* mfmaIndex:12 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[96:103] -/* mfmaIndex:13 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[104:111] -/* mfmaIndex:14 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], 
v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[112:119] -/* mfmaIndex:15 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - -/* iter 1 */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:16 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[0:7] -/* mfmaIndex:17 */ -/* localReadsVacancy: letencyLeft 5 */ -/* 1 LDS buffer: read-sync-write */ -s_waitcnt lgkmcnt(0) // -s_barrier // -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[8:15] -/* mfmaIndex:18 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[16:23] -/* mfmaIndex:19 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[24:31] -/* mfmaIndex:20 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[32:39] -/* mfmaIndex:21 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[40:47] -/* mfmaIndex:22 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], 
v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[48:55] -/* mfmaIndex:23 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[56:63] -/* mfmaIndex:24 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[64:71] -/* mfmaIndex:25 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[72:79] -/* mfmaIndex:26 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[80:87] -/* mfmaIndex:27 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[88:95] -/* mfmaIndex:28 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[96:103] -/* mfmaIndex:29 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[104:111] -/* mfmaIndex:30 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[112:119] -/* mfmaIndex:31 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 2 (reset local read pointers iteration) (swap local read pointers iteration) */ - -/* grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:32 */ -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-1wait for prior 
local read local write old=0, new=4 newLW=4 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[0:7] -/* mfmaIndex:33 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[8:15] -/* mfmaIndex:34 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[16:23] -/* mfmaIndex:35 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[24:31] -/* mfmaIndex:36 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[32:39] -/* mfmaIndex:37 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[40:47] -/* mfmaIndex:38 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[48:55] -/* mfmaIndex:39 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[56:63] -/* mfmaIndex:40 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[64:71] -/* mfmaIndex:41 */ -s_waitcnt vmcnt(7) // lgkmcnt=-1 vmcnt=3wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0 -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[72:79] -/* mfmaIndex:42 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], 
v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[80:87] -/* mfmaIndex:43 */ -s_waitcnt vmcnt(6) // lgkmcnt=-1 vmcnt=2wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280 -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[88:95] -/* mfmaIndex:44 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[96:103] -/* mfmaIndex:45 */ -s_waitcnt vmcnt(5) // lgkmcnt=-1 vmcnt=1wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560 -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[104:111] -/* mfmaIndex:46 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[112:119] -/* mfmaIndex:47 */ -s_waitcnt vmcnt(4) // lgkmcnt=-1 vmcnt=0wait for global read before writing to local -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840 - -/* local read swap offsets a */ - -/* local read swap offsets b */ - -/* local read init pointers a */ - -/* localReadInitPointers */ - -/* local read init pointers b */ - -/* localReadInitPointers */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 3 (swap and reset local write pointers iteration) */ - -/* 
grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:48 */ - -/* local write swap offsets a */ - -/* local write swap offsets b */ -s_waitcnt lgkmcnt(4) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[0:7] -/* mfmaIndex:49 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[8:15] -/* mfmaIndex:50 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[16:23] -/* mfmaIndex:51 */ -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-13wait for local write -// Skip force waitcnt0 -s_barrier // -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[24:31] -/* mfmaIndex:52 */ -ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[32:39] -/* mfmaIndex:53 */ -ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[40:47] -/* mfmaIndex:54 */ -ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], 
v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[48:55] -/* mfmaIndex:55 */ -ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[56:63] -/* mfmaIndex:56 */ -ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[64:71] -/* mfmaIndex:57 */ -ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[72:79] -/* mfmaIndex:58 */ -ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[80:87] -/* mfmaIndex:59 */ -ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[88:95] -/* mfmaIndex:60 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[96:103] -/* mfmaIndex:61 */ -v_mfma_f64_16x16x4f64 v[104:111], 
v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[104:111] -/* mfmaIndex:62 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[112:119] -/* mfmaIndex:63 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[120:127] -/* numPrefetchIter=1 */ -/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - -label_0014: - - -/******************************************/ -/* Opt. NoLoadLoop Without PAP - Begin */ -/******************************************/ - -s_mov_b32 s68, s[sgprBeta+0] // tmp = Beta[0] -s_or_b32 s68, s[sgprBeta+1], s68 // tmp |= Beta[1] -s_cmpk_eq_u32 s68, 0x0 // Beta == 0 -s_cbranch_scc0 OptNLL_End_16 // Branch if Beta is not zero - -s_mov_b32 s68, 0 // Low part of double 1.0 -s_mov_b32 s69, 0x3ff00000 // High part of double 1.0 -s_cmp_eq_u64 s[sgprAlpha:sgprAlpha+1], s[68:69] // Alpha == 1.0 ? -s_cbranch_scc0 OptNLL_End_16 // branch if alpha != 1 - -s_and_b32 s68, 127, s[sgprSizeI] // s68 = s[sgprSizeI] % 128 -s_add_u32 s69, -0x1, s[sgprNumWorkGroups0] // -s_cmp_ge_u32 s[sgprWorkGroup0], s69 // wg0 >= nwg0-1 ? 
-s_cselect_b32 s68, s68, 0 // set rMT0 -s_cmpk_gt_u32 s68, 0x0 // rMT0 > 0 -s_cbranch_scc1 OptNLL_End_16 // jump if edges required -s_and_b32 s68, 127, s[sgprSizeJ] // s68 = s[sgprSizeJ] % 128 -s_add_u32 s69, -0x1, s[sgprNumWorkGroups1] // -s_cmp_ge_u32 s[sgprWorkGroup1], s69 // wg1 >= nwg1-1 -s_cselect_b32 s68, s68, 0 // set rMT1 -s_cmpk_gt_u32 s68, 0x0 // rMT1 > 0 -s_cbranch_scc1 OptNLL_End_16 // jump if edges required - -s_and_b32 s69, 15, s[sgprSizesSum+0] // s69 = s[sgprSizesSum+0] % 16 -s_cmp_eq_u32 s69, 0x0 // numIterL == 0 -s_cbranch_scc0 OptNLL_End_16 // skip if tail loop required - - - -/* iter 0 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:0 */ -s_waitcnt lgkmcnt(0) vmcnt(3) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[0:7] -/* mfmaIndex:1 */ -ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[8:15] -/* mfmaIndex:2 */ -ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[16:23] -/* mfmaIndex:3 */ -ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], 
v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[24:31] -/* mfmaIndex:4 */ -ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[32:39] -/* mfmaIndex:5 */ -ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[40:47] -/* mfmaIndex:6 */ -ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[48:55] -/* mfmaIndex:7 */ -ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[56:63] -/* mfmaIndex:8 */ -ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[64:71] -/* mfmaIndex:9 */ -/* localReadsVacancy: letencyLeft 1 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[72:79] -/* mfmaIndex:10 */ -v_mfma_f64_16x16x4f64 
v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[80:87] -/* mfmaIndex:11 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[88:95] -/* mfmaIndex:12 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[96:103] -/* mfmaIndex:13 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[104:111] -/* mfmaIndex:14 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[112:119] -/* mfmaIndex:15 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - -/* iter 1 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:16 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(0) vmcnt(2) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[0:7] -/* mfmaIndex:17 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[8:15] -/* mfmaIndex:18 */ -/* localReadsVacancy: 
letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[16:23] -/* mfmaIndex:19 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[24:31] -/* mfmaIndex:20 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[32:39] -/* mfmaIndex:21 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[40:47] -/* mfmaIndex:22 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[48:55] -/* mfmaIndex:23 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[56:63] -/* mfmaIndex:24 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[64:71] -/* mfmaIndex:25 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[72:79] -/* mfmaIndex:26 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[80:87] -/* mfmaIndex:27 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[88:95] -/* 
mfmaIndex:28 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[96:103] -/* mfmaIndex:29 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[104:111] -/* mfmaIndex:30 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[112:119] -/* mfmaIndex:31 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 2 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:32 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(0) vmcnt(1) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[0:7] -/* mfmaIndex:33 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[8:15] -/* mfmaIndex:34 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[16:23] -/* mfmaIndex:35 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[24:31], 
v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[24:31] -/* mfmaIndex:36 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[32:39] -/* mfmaIndex:37 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[40:47] -/* mfmaIndex:38 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[48:55] -/* mfmaIndex:39 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[56:63] -/* mfmaIndex:40 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[64:71] -/* mfmaIndex:41 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[72:79] -/* mfmaIndex:42 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[80:87] -/* mfmaIndex:43 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[88:95] -/* mfmaIndex:44 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[96:103] -/* mfmaIndex:45 */ -/* localReadsVacancy: letencyLeft 5 
*/ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[104:111] -/* mfmaIndex:46 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[112:119] -/* mfmaIndex:47 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 3 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:48 */ -s_waitcnt lgkmcnt(0) vmcnt(0) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[0:7] -/* mfmaIndex:49 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[8:15] -/* mfmaIndex:50 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[16:23] -/* mfmaIndex:51 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[24:31] -/* mfmaIndex:52 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[32:39] -/* mfmaIndex:53 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[40:47] -/* mfmaIndex:54 */ 
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[48:55] -/* mfmaIndex:55 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[56:63] -/* mfmaIndex:56 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[64:71] -/* mfmaIndex:57 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[72:79] -/* mfmaIndex:58 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[80:87] -/* mfmaIndex:59 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[88:95] -/* mfmaIndex:60 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[96:103] -/* mfmaIndex:61 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[104:111] -/* mfmaIndex:62 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[112:119] -/* mfmaIndex:63 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=0 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - -/* Stores for OptNLL */ -Summation_End_OptNLL_17: -s_setprio 0 // optimization store -/* endSummation: add vgpr [128...132) to pool (vgprValuA_X0_I0) */ -/* endSummation: add vgpr [144...160) to 
pool (vgprValuB_X0_I0) */ -/* endSummation: add vgpr [208...252) to pool */ -.set NumFullBlocks, UNDEF -.set WgmRemainder1, UNDEF -.set MagicNumberWgmRemainder1, UNDEF -.set ShadowLimitA, UNDEF -.set ShadowLimitB, UNDEF -.set WrapUA, UNDEF -.set WrapUB, UNDEF -.set GlobalReadIncsA, UNDEF -.set GlobalReadIncsB, UNDEF - -/* Mapping of Acc register -> C Vgpr register */ - -/* Multiply MI out register with Alpha -> C Vgpr register */ -/* computeStoreVgprs */ -v_lshrrev_b32 v144, 6, v[vgprSerial] // v144 = v[vgprSerial] / 64 -v_lshrrev_b32 v145, 2, v144 // v145 = v144 / 4 -v_mul_lo_u32 v145, 0x10, v145 // wave coordination offset 1 -v_and_b32 v129, 63, v[vgprSerial] // v129 = v[vgprSerial] % 64 -v_lshrrev_b32 v129, 4, v129 // v129 = v129 / 16 - // thread0 * continuous_output (multiplier is 1, do nothing) -v_add_u32 v129, v145, v129 // coordination 1 = wave_id1 + tid1 -v_mul_lo_u32 v130, v129, s[sgprStrideC1J] // offset 1 -v_mul_lo_u32 v131, v129, s[sgprStrideD1J] // offset 1 -v_and_b32 v128, 3, v144 // v128 = v144 % 4 -v_mul_lo_u32 v128, 0x10, v128 // wave coordination offset 0 -v_and_b32 v145, 15, v[vgprSerial] // v145 = v[vgprSerial] % 16 -_v_add_lshl_u32 v128, v145, v128, 1 // coordination 0 = wave_id0 + tid0 -s_mul_i32 s63, 128, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v128, s63, v128 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 -s_mul_i32 s63, 128, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v129, s63, v129 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 -GW_B0_E0_20: - -/* edge=0, allocate 2 sgpr. 
perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */ -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ -/* (0,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(0,0,0,0) */ -_v_add_lshl_u32 v146, v131, v128, 0x3 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=128, coord0Vgpr=128 -v_mov_b32 v[vgprValuC+148], v[vgprValuC+0] // copy MI out reg to vreg[0] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+1] // copy MI out reg to vreg[1] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+8] // copy MI out reg to vreg[2] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+9] // copy MI out reg to vreg[3] - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ -/* (1,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(1,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+2] // copy MI out reg to vreg[4] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+3] // copy MI out reg to vreg[5] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+10] // copy MI out reg to vreg[6] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+11] // copy MI out reg to vreg[7] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) 
* bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #2 (d1,d0,vc1,vc0) = */ -/* (2,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(2,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+4] // copy MI out reg to vreg[8] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+5] // copy MI out reg to vreg[9] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+12] // copy MI out reg to vreg[10] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+13] // copy MI out reg to vreg[11] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #3 (d1,d0,vc1,vc0) = */ -/* (3,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(3,0,0,0) */ -v_mov_b32 
v[vgprValuC+148], v[vgprValuC+6] // copy MI out reg to vreg[12] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+7] // copy MI out reg to vreg[13] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+14] // copy MI out reg to vreg[14] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+15] // copy MI out reg to vreg[15] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #4 (d1,d0,vc1,vc0) = */ -/* (4,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(4,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+16] // copy MI out reg to vreg[16] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+17] // copy MI out reg to vreg[17] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+24] // copy MI out reg to vreg[18] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+25] // copy MI out reg to vreg[19] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* 
optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #5 (d1,d0,vc1,vc0) = */ -/* (5,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(5,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+18] // copy MI out reg to vreg[20] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+19] // copy MI out reg to vreg[21] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+26] // copy MI out reg to vreg[22] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+27] // copy MI out reg to vreg[23] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #6 (d1,d0,vc1,vc0) = */ -/* (6,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(6,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+20] // copy MI out reg to vreg[24] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+21] // copy MI out reg to vreg[25] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+28] // copy MI out reg to vreg[26] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+29] // copy MI out reg to vreg[27] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale 
StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #7 (d1,d0,vc1,vc0) = */ -/* (7,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(7,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+22] // copy MI out reg to vreg[28] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+23] // copy MI out reg to vreg[29] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+30] // copy MI out reg to vreg[30] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+31] // copy MI out reg to vreg[31] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #8 (d1,d0,vc1,vc0) = */ -/* (8,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(8,0,0,0) 
*/ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+32] // copy MI out reg to vreg[32] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+33] // copy MI out reg to vreg[33] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+40] // copy MI out reg to vreg[34] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+41] // copy MI out reg to vreg[35] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #9 (d1,d0,vc1,vc0) = */ -/* (9,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(9,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+34] // copy MI out reg to vreg[36] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+35] // copy MI out reg to vreg[37] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+42] // copy MI out reg to vreg[38] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+43] // copy MI out reg to vreg[39] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst 
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #10 (d1,d0,vc1,vc0) = */ -/* (10,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(10,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+36] // copy MI out reg to vreg[40] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+37] // copy MI out reg to vreg[41] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+44] // copy MI out reg to vreg[42] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+45] // copy MI out reg to vreg[43] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #11 (d1,d0,vc1,vc0) = */ -/* (11,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(11,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+38] // copy MI out reg to vreg[44] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+39] // copy MI out reg to vreg[45] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+46] // copy MI out reg to vreg[46] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+47] // copy MI out reg to vreg[47] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 
32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #12 (d1,d0,vc1,vc0) = */ -/* (12,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(12,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+48] // copy MI out reg to vreg[48] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+49] // copy MI out reg to vreg[49] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+56] // copy MI out reg to vreg[50] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+57] // copy MI out reg to vreg[51] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #13 (d1,d0,vc1,vc0) = */ -/* (13,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* 
(d1,vc1,d0,vc0)=(13,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+50] // copy MI out reg to vreg[52] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+51] // copy MI out reg to vreg[53] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+58] // copy MI out reg to vreg[54] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+59] // copy MI out reg to vreg[55] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #14 (d1,d0,vc1,vc0) = */ -/* (14,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(14,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+52] // copy MI out reg to vreg[56] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+53] // copy MI out reg to vreg[57] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+60] // copy MI out reg to vreg[58] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+61] // copy MI out reg to vreg[59] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by 
previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #15 (d1,d0,vc1,vc0) = */ -/* (15,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(15,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+54] // copy MI out reg to vreg[60] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+55] // copy MI out reg to vreg[61] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+62] // copy MI out reg to vreg[62] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+63] // copy MI out reg to vreg[63] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #16 (d1,d0,vc1,vc0) = */ -/* (16,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(16,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+64] // copy MI out reg to vreg[64] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+65] // copy MI out reg to vreg[65] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+72] // copy MI out reg to vreg[66] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+73] // copy MI out reg to vreg[67] - -/* apply mask, calc new C and issue writes */ 
-s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #17 (d1,d0,vc1,vc0) = */ -/* (17,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(17,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+66] // copy MI out reg to vreg[68] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+67] // copy MI out reg to vreg[69] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+74] // copy MI out reg to vreg[70] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+75] // copy MI out reg to vreg[71] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #18 (d1,d0,vc1,vc0) = */ -/* (18,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue 
loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(18,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+68] // copy MI out reg to vreg[72] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+69] // copy MI out reg to vreg[73] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+76] // copy MI out reg to vreg[74] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+77] // copy MI out reg to vreg[75] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #19 (d1,d0,vc1,vc0) = */ -/* (19,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(19,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+70] // copy MI out reg to vreg[76] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+71] // copy MI out reg to vreg[77] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+78] // copy MI out reg to vreg[78] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+79] // copy MI out reg to vreg[79] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when 
next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #20 (d1,d0,vc1,vc0) = */ -/* (20,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(20,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+80] // copy MI out reg to vreg[80] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+81] // copy MI out reg to vreg[81] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+88] // copy MI out reg to vreg[82] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+89] // copy MI out reg to vreg[83] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #21 (d1,d0,vc1,vc0) = */ -/* (21,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(21,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+82] // copy MI out reg to vreg[84] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+83] // copy MI out reg to vreg[85] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+90] // copy MI out reg to vreg[86] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+91] // copy MI out reg to vreg[87] - -/* apply mask, calc 
new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #22 (d1,d0,vc1,vc0) = */ -/* (22,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(22,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+84] // copy MI out reg to vreg[88] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+85] // copy MI out reg to vreg[89] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+92] // copy MI out reg to vreg[90] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+93] // copy MI out reg to vreg[91] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #23 (d1,d0,vc1,vc0) = */ -/* (23,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, 
apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(23,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+86] // copy MI out reg to vreg[92] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+87] // copy MI out reg to vreg[93] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+94] // copy MI out reg to vreg[94] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+95] // copy MI out reg to vreg[95] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #24 (d1,d0,vc1,vc0) = */ -/* (24,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(24,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+96] // copy MI out reg to vreg[96] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+97] // copy MI out reg to vreg[97] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+104] // copy MI out reg to vreg[98] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+105] // copy MI out reg to vreg[99] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 
wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #25 (d1,d0,vc1,vc0) = */ -/* (25,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(25,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+98] // copy MI out reg to vreg[100] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+99] // copy MI out reg to vreg[101] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+106] // copy MI out reg to vreg[102] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+107] // copy MI out reg to vreg[103] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #26 (d1,d0,vc1,vc0) = */ -/* (26,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(26,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+100] // copy MI out reg to vreg[104] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+101] // copy MI out reg to vreg[105] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+108] // copy MI out reg to vreg[106] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+109] // copy MI out reg 
to vreg[107] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #27 (d1,d0,vc1,vc0) = */ -/* (27,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(27,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+102] // copy MI out reg to vreg[108] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+103] // copy MI out reg to vreg[109] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+110] // copy MI out reg to vreg[110] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+111] // copy MI out reg to vreg[111] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #28 (d1,d0,vc1,vc0) = */ -/* (28,0,0,0:vw2) */ 
-/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(28,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+112] // copy MI out reg to vreg[112] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+113] // copy MI out reg to vreg[113] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+120] // copy MI out reg to vreg[114] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+121] // copy MI out reg to vreg[115] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #29 (d1,d0,vc1,vc0) = */ -/* (29,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(29,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+114] // copy MI out reg to vreg[116] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+115] // copy MI out reg to vreg[117] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+122] // copy MI out reg to vreg[118] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+123] // copy MI out reg to vreg[119] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, 
s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #30 (d1,d0,vc1,vc0) = */ -/* (30,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(30,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+116] // copy MI out reg to vreg[120] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+117] // copy MI out reg to vreg[121] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+124] // copy MI out reg to vreg[122] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+125] // copy MI out reg to vreg[123] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #31 (d1,d0,vc1,vc0) = */ -/* (31,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(31,0,0,0) */ -v_mov_b32 v[vgprValuC+148], v[vgprValuC+118] // copy MI out reg to vreg[124] -v_mov_b32 v[vgprValuC+149], v[vgprValuC+119] // copy MI out reg to vreg[125] -v_mov_b32 v[vgprValuC+150], v[vgprValuC+126] // copy MI out 
reg to vreg[126] -v_mov_b32 v[vgprValuC+151], v[vgprValuC+127] // copy MI out reg to vreg[127] - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_branch label_GW_End_22 // jump to end -label_GW_End_22: - -s_endpgm // Kernel End -OptNLL_End_16: - - -/******************************************/ -/* Ord. NoLoadLoop - Begin */ -/******************************************/ - - - - -/* iter 0 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:0 */ -s_waitcnt lgkmcnt(0) vmcnt(3) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[0:7] -/* mfmaIndex:1 */ -ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[8:15] -/* mfmaIndex:2 */ -ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[16:23] -/* mfmaIndex:3 */ -ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], 
v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[24:31] -/* mfmaIndex:4 */ -ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[32:39] -/* mfmaIndex:5 */ -ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[40:47] -/* mfmaIndex:6 */ -ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[48:55] -/* mfmaIndex:7 */ -ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[56:63] -/* mfmaIndex:8 */ -ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0 -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[64:71] -/* mfmaIndex:9 */ -/* localReadsVacancy: 
letencyLeft 1 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[72:79] -/* mfmaIndex:10 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[80:87] -/* mfmaIndex:11 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[88:95] -/* mfmaIndex:12 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[96:103] -/* mfmaIndex:13 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[104:111] -/* mfmaIndex:14 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[112:119] -/* mfmaIndex:15 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */ - - -/* iter 1 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:16 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(0) vmcnt(2) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[0:7] -/* mfmaIndex:17 */ -/* localReadsVacancy: 
letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[8:15] -/* mfmaIndex:18 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[16:23] -/* mfmaIndex:19 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[24:31] -/* mfmaIndex:20 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[32:39] -/* mfmaIndex:21 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[40:47] -/* mfmaIndex:22 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[48:55] -/* mfmaIndex:23 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[56:63] -/* mfmaIndex:24 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[64:71] -/* mfmaIndex:25 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[72:79] -/* mfmaIndex:26 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[80:87] -/* 
mfmaIndex:27 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[88:95] -/* mfmaIndex:28 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[96:103] -/* mfmaIndex:29 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[104:111] -/* mfmaIndex:30 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[112:119] -/* mfmaIndex:31 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 2 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:32 */ -/* localReadsVacancy: letencyLeft 5 */ -s_waitcnt lgkmcnt(0) vmcnt(1) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[0:7] -/* mfmaIndex:33 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[8:15] -/* mfmaIndex:34 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[16:23], 
v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[16:23] -/* mfmaIndex:35 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[24:31] -/* mfmaIndex:36 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[32:39] -/* mfmaIndex:37 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[40:47] -/* mfmaIndex:38 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[48:55] -/* mfmaIndex:39 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[56:63] -/* mfmaIndex:40 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[64:71] -/* mfmaIndex:41 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[72:79] -/* mfmaIndex:42 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[80:87] -/* mfmaIndex:43 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[88:95] -/* mfmaIndex:44 */ -/* localReadsVacancy: letencyLeft 5 */ 
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[96:103] -/* mfmaIndex:45 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[104:111] -/* mfmaIndex:46 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[112:119] -/* mfmaIndex:47 */ -/* localReadsVacancy: letencyLeft 5 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - - -/* iter 3 (last unrolled loop) */ - -/* grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48 */ -/* numMfmaForLR:13, barrierMfmaIndex:50 */ -/* mfmaIndex:48 */ -s_waitcnt lgkmcnt(0) vmcnt(0) // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[0:7] -/* mfmaIndex:49 */ -v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[8:15] -/* mfmaIndex:50 */ -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[16:23] -/* mfmaIndex:51 */ -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[24:31] -/* mfmaIndex:52 */ -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], 
v[32:39] -/* mfmaIndex:53 */ -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[40:47] -/* mfmaIndex:54 */ -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[48:55] -/* mfmaIndex:55 */ -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[56:63] -/* mfmaIndex:56 */ -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[64:71] -/* mfmaIndex:57 */ -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[72:79] -/* mfmaIndex:58 */ -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[80:87] -/* mfmaIndex:59 */ -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[88:95] -/* mfmaIndex:60 */ -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[96:103] -/* mfmaIndex:61 */ -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[104:111] -/* mfmaIndex:62 */ -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[112:119] -/* mfmaIndex:63 */ -v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[120:127] -/* numPrefetchIter=0 */ -/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=0 readsPerIterA=1 */ -/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */ - 
-PrefetchGlobalLastIterEnd_4: - - -/******************************************/ -/* Tail Loop */ -/******************************************/ - - -/* local write reset offsets a */ - - - -/* local write reset offsets b */ - - -/* tail loop: add vgpr [132...136) to pool (vgprValuA_X1_I0) */ -/* tail loop: add vgpr [136...140) to pool (vgprValuA_X2_I0) */ -/* tail loop: add vgpr [140...144) to pool (vgprValuA_X3_I0) */ -/* tail loop: add vgpr [160...176) to pool (vgprValuB_X1_I0) */ -/* tail loop: add vgpr [176...192) to pool (vgprValuB_X2_I0) */ -/* tail loop: add vgpr [192...208) to pool (vgprValuB_X3_I0) */ - -//numIterL = (((sizeL % LOCAL_DEPTHU) + LOCAL_SPLITU - 1) / LOCAL_SPLITU) -s_and_b32 s[sgprLoopCounterL], 15, s[sgprSizesSum+0] // s[sgprLoopCounterL] = s[sgprSizesSum+0] % 16 -s_cmp_eq_u32 s[sgprLoopCounterL], 0x0 // numIterL == 0 -s_mov_b32 s[sgprOrigLoopCounter], 0 // repurpose to count each localRead increment -s_cbranch_scc1 SkipTailLoopL_7 // skip to end of tail loop b/c numIter==0 - - -/* remove stagger offsets for tail loop */ - -s_sub_i32 s68, 3, s[sgprStaggerUIter] // -s_mul_hi_i32 s69, s68, s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_mul_i32 s68, s68, s[sgprGlobalReadIncsA+0] // start offset S in bytes -s_sub_u32 s68, s68, s[sgprWrapUA] // S - WrapU -s_subb_u32 s69, s69, s[sgprWrapUA+1] // S - WrapU -s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc) -s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0] // Move shadow to real if we are within 2^32 - -s_sub_i32 s68, 3, s[sgprStaggerUIter] // -s_mul_hi_i32 s69, s68, s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_mul_i32 s68, s68, s[sgprGlobalReadIncsB+0] // start offset S in bytes -s_sub_u32 s68, s68, s[sgprWrapUB] // S - WrapU -s_subb_u32 s69, s69, s[sgprWrapUB+1] // S - WrapU -s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68 // gra SRD += inc(lower) -s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s69 // gra SRD += inc(upper) -s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc) -s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc) -s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? -s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0] // Move shadow to real if we are within 2^32 - - -/* Update M0 for DTLDS */ - - - -/* global read a */ - -/* g2l=0, load component 0 */ -buffer_load_dwordx2 v[vgprG2LA+0+0:vgprG2LA+0+0+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value -/* g2l=0, load component 1 */ -buffer_load_dwordx2 v[vgprG2LA+0+2:vgprG2LA+0+2+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value -/* g2l=4, load component 0 */ -buffer_load_dwordx2 v[vgprG2LA+4+0:vgprG2LA+4+0+1], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value -/* g2l=4, load component 1 */ -buffer_load_dwordx2 v[vgprG2LA+4+2:vgprG2LA+4+2+1], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value -/* g2l=8, load component 0 */ -buffer_load_dwordx2 v[vgprG2LA+8+0:vgprG2LA+8+0+1], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value -/* g2l=8, load component 1 */ -buffer_load_dwordx2 v[vgprG2LA+8+2:vgprG2LA+8+2+1], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value -/* g2l=12, load 
component 0 */ -buffer_load_dwordx2 v[vgprG2LA+12+0:vgprG2LA+12+0+1], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value -/* g2l=12, load component 1 */ -buffer_load_dwordx2 v[vgprG2LA+12+2:vgprG2LA+12+2+1], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value - - -/* Update M0 for DTLDS */ - - - -/* global read b */ - -/* g2l=0, load component 0 */ -buffer_load_dwordx2 v[vgprG2LB+0+0:vgprG2LB+0+0+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value -/* g2l=0, load component 1 */ -buffer_load_dwordx2 v[vgprG2LB+0+2:vgprG2LB+0+2+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value -/* g2l=4, load component 0 */ -buffer_load_dwordx2 v[vgprG2LB+4+0:vgprG2LB+4+0+1], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value -/* g2l=4, load component 1 */ -buffer_load_dwordx2 v[vgprG2LB+4+2:vgprG2LB+4+2+1], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value -/* g2l=8, load component 0 */ -buffer_load_dwordx2 v[vgprG2LB+8+0:vgprG2LB+8+0+1], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value -/* g2l=8, load component 1 */ -buffer_load_dwordx2 v[vgprG2LB+8+2:vgprG2LB+8+2+1], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value -/* g2l=12, load component 0 */ -buffer_load_dwordx2 v[vgprG2LB+12+0:vgprG2LB+12+0+1], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value -/* g2l=12, load component 1 */ -buffer_load_dwordx2 v[vgprG2LB+12+2:vgprG2LB+12+2+1], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value - -s_waitcnt vmcnt(0) // lgkmcnt=-1 vmcnt=02wait for global read - -// Skip force waitcnt0 -s_barrier // - - - - -/* local write a */ - -ds_write_b128 
v[vgprLocalWriteAddrA], v[vgprG2LA+0:vgprG2LA+0+3] offset:0 // lwoA_0_0_0_0 = (0*LSCA) + (0*LSPA)(*MT0I+PAD) = 0 -ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+4:vgprG2LA+4+3] offset:4096 // lwoA_0_0_1_0 = (0*LSCA) + (1*LSPA)(*MT0I+PAD) = 4096 -ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+8:vgprG2LA+8+3] offset:8192 // lwoA_0_0_2_0 = (0*LSCA) + (2*LSPA)(*MT0I+PAD) = 8192 -ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+12:vgprG2LA+12+3] offset:12288 // lwoA_0_0_3_0 = (0*LSCA) + (3*LSPA)(*MT0I+PAD) = 12288 - - -/* local write b */ - -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0 -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280 -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560 -ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840 - - -/* Recalc local read offsets */ - -/*lr0I*/ -v_and_b32 v134, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) -v_and_b32 v133, 15, v134 // 1. N offset: nIdx = wtid % MI_N(16) - // 1. N offset: nOffset = nIdx * nStride(1) (multiplier is 1, do nothing) -v_lshrrev_b32 v132, 4, v134 // 2. block offset: bnIdx = wtid / dividedForBlkId(16) -v_and_b32 v132, 0, v132 // 2. block offset: bnIdx = bnIdx % num1DBlocks(1) -v_lshlrev_b32 v132, 0x4, v132 // 2. block offset: bnOffset = bnIdx * strideBlock(16) -_v_add_u32 v133, v132, v133 // 3. add N and block offset: bnOffset = block and N offset -v_lshlrev_b32 v133, 0x1, v133 // 3. apply VectorWidth: bnOffset = bnOffset * vw(2) -v_lshrrev_b32 v134, 4, v134 // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) -v_lshlrev_b32 v134, 0x7, v134 // 4. K offset: lrKOffset = kIdx * mStride(128) -_v_add_u32 v133, v134, v133 // 5. 
offset in wave: lrOffset = bnOffset + lrKOffset -v_lshrrev_b32 v132, 6, v[vgprSerial] // 6. wave offset in N dimen: wtid = tid / dividedForWaveId(64) -v_and_b32 v132, 3, v132 // 6. wave offset in M dimen: wtid0 = wtid / num1DWaves(4) -v_lshlrev_b32 v132, 0x5, v132 // 6. wave offset in M dimen: wOffset = wtid0 * W0Stride(32) -_v_add_u32 v133, v132, v133 // 7. final local read offset: flrOffset = lrOffset + WOffset -/*lr1J*/ -v_and_b32 v135, 63, v[vgprSerial] // 0. thread id in wave: wtid = tid % wavelength(64) -v_and_b32 v134, 15, v135 // 1. N offset: nIdx = wtid % MI_N(16) -v_lshlrev_b32 v134, 0x4, v134 // 1. N offset: nOffset = nIdx * nStride(16) -v_lshrrev_b32 v132, 4, v135 // 2. block offset: bnIdx = wtid / dividedForBlkId(16) -v_and_b32 v132, 0, v132 // 2. block offset: bnIdx = bnIdx % num1DBlocks(1) -v_lshlrev_b32 v132, 0x8, v132 // 2. block offset: bnOffset = bnIdx * strideBlock(256) -_v_add_u32 v134, v132, v134 // 3. add N and block offset: bnOffset = block and N offset - // 3. apply VectorWidth: bnOffset = bnOffset * vw(1) (multiplier is 1, do nothing) -v_lshrrev_b32 v135, 4, v135 // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1)) - // 4. K offset: lrKOffset = kIdx * mStride(1) (multiplier is 1, do nothing) -_v_add_u32 v134, v135, v134 // 5. 
offset in wave: lrOffset = bnOffset + lrKOffset -v_lshrrev_b32 v132, 8, v[vgprSerial] // LSU offset: sgid = Serial / subGroup(256) -s_mov_b32 s68, 128 // LSU offset: stirde = MT0(128) + PAD0(0) -v_mul_lo_u32 v132, s68, v132 // LSU offset: lsuoffset = sgid*(MT0+PAD) -_v_add_lshl_u32 v[vgprLocalReadAddrA], v132, v133, 0x3 // Final Offset: offset = (lro0*VW+lsuoffset)*bpe -/* N/A */ -v_lshrrev_b32 v132, 8, v[vgprSerial] // LSU offset: sgid = Serial / subGroup(256) -s_mov_b32 s68, 128 // LSU offset: stirde = MT1(128) + PAD1(0) -v_mul_lo_u32 v132, s68, v132 // LSU offset: lsuoffset = sgid*(MT1+PAD) -_v_add_lshl_u32 v[vgprLocalReadAddrB], v132, v134, 0x3 // Final Offset: offset = (lro1*VW+lsuoffset)*bpe -v_lshrrev_b32 v133, 7, v[vgprLocalReadAddrB] // Final Offset: padding 4 per block 128 -v_lshlrev_b32 v133, 0x5, v133 // Final Offset: padding 4 per block 128 -_v_add_u32 v[vgprLocalReadAddrB], v133, v[vgprLocalReadAddrB] // Final Offset: add padding 4 per block 128 -_v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x4000, v[vgprLocalReadAddrB+0] // += LdsOffsetB (lower) - -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-15wait for local write - -// Skip force waitcnt0 -s_barrier // - - -/* local read reset offsets a */ - - - -/* local read reset offsets b */ - - - -/* local read init pointers a */ - - -/* localReadInitPointers */ - - -/* local read init pointers b */ - - -/* localReadInitPointers */ - - -/* tail loop: macs */ - -TailLoopBeginL_5: - - -/* local read a */ - -ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 - - -/* local read b */ - -ds_read_b64 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+1], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+2:vgprValuB_X0_I0+2+1], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 
oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+1], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+6:vgprValuB_X0_I0+6+1], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+1], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+10:vgprValuB_X0_I0+10+1], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+1], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 -ds_read_b64 v[vgprValuB_X0_I0+14:vgprValuB_X0_I0+14+1], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0 - - -/* local read inc a */ - -s_mov_b32 s68, 0x1000 // inc -_v_add_co_u32 v[vgprLocalReadAddrA], vcc, s68, v[vgprLocalReadAddrA] // lrA += 4096 (LSU*(MT+PAD)*bpe) - - -/* local read inc b */ - -s_mov_b32 s68, 0x20 // inc -_v_add_co_u32 v[vgprLocalReadAddrB], vcc, s68, v[vgprLocalReadAddrB] // lrB += 32 (LSU*(MT+PAD)*bpe) - -s_waitcnt lgkmcnt(0) // lgkmcnt=0 vmcnt=-14wait for local read - - -v_and_b32 v132, 63, v[vgprSerial] // v132 = v[vgprSerial] % 64 -v_lshrrev_b32 v132, 4, v132 // v132 = v132 / 16 - // v132 = v132 * 1 (multiplier is 1, do nothing) -v_cmp_ge_i32 s[68:69], v132, s[sgprLoopCounterL] // check K index >= Size L -v_cndmask_b32 v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+0+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+2+0], v[vgprValuA_X0_I0+2+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+0], v[vgprValuB_X0_I0+0+0], 0x0, s[68:69] // set 0 
if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+2+0], v[vgprValuB_X0_I0+2+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+0], v[vgprValuB_X0_I0+4+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+6+0], v[vgprValuB_X0_I0+6+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+0], v[vgprValuB_X0_I0+8+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+10+0], v[vgprValuB_X0_I0+10+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+0], v[vgprValuB_X0_I0+12+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+14+0], v[vgprValuB_X0_I0+14+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuA_X0_I0+2+1], v[vgprValuA_X0_I0+2+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+0+1], v[vgprValuB_X0_I0+0+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+2+1], v[vgprValuB_X0_I0+2+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+4+1], v[vgprValuB_X0_I0+4+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+6+1], v[vgprValuB_X0_I0+6+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_X0_I0+8+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+10+1], v[vgprValuB_X0_I0+10+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+12+1], v[vgprValuB_X0_I0+12+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -v_cndmask_b32 v[vgprValuB_X0_I0+14+1], v[vgprValuB_X0_I0+14+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL -s_nop 1 -v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[0:7] -v_mfma_f64_16x16x4f64 v[8:15], 
v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[8:15] -v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[16:23] -v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[24:31] -v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[32:39] -v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[40:47] -v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[48:55] -v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[56:63] -v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[64:71] -v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[72:79] -v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[80:87] -v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[88:95] -v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[96:103] -v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[104:111] -v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[112:119] -v_mfma_f64_16x16x4f64 
v[120:127], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[120:127] - - -/* closeLoop loopL finalLoop=1 tailLoop=1 */ -s_sub_i32 s[sgprLoopCounterL], s[sgprLoopCounterL], 0x4 // dec counterL (tailLoop) -s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x4 // inc counterL -s_cmp_le_i32 s[sgprLoopCounterL], 0x0 // counterL<=0 -s_cbranch_scc0 TailLoopBeginL_5 // restart LoopL -TailLoopEndL_6: - -SkipTailLoopL_7: - -Summation_End_25: -s_setprio 0 // optimization store -/* endSummation: add vgpr [128...132) to pool (vgprValuA_X0_I0) */ -/* endSummation: add vgpr [144...160) to pool (vgprValuB_X0_I0) */ -/* endSummation: add vgpr [208...252) to pool */ -.set NumFullBlocks, UNDEF -.set WgmRemainder1, UNDEF -.set MagicNumberWgmRemainder1, UNDEF -.set ShadowLimitA, UNDEF -.set ShadowLimitB, UNDEF -.set WrapUA, UNDEF -.set WrapUB, UNDEF -.set GlobalReadIncsA, UNDEF -.set GlobalReadIncsB, UNDEF - -/* Mapping of Acc register -> C Vgpr register */ - -/* Multiply MI out register with Alpha -> C Vgpr register */ - -// TODO in Generator -// skip shift vector if M % 2 == 0 -s_and_b32 s63, 0x1, s[sgprSizeI] -s_cbranch_scc0 label_0029 // done shifting - -/* shift vector components d0 */ - -v_mov_b32 v131, s[sgprWorkGroup0] // -v_mul_i32_i24 v131, -0x80, v131 // wg*MT -_v_add_co_u32 v131, vcc, s[sgprSizesFree+0], v131 // wgMT = Size - wg*MT -v_mov_b32 v132, 0x80 // MT -v_cmp_lt_u32 s[64:65], v131, v132 // wgMT < MT -v_cndmask_b32 v131, v132, v131, s[64:65] // wgMT = (wgMT < MT) ? wgMT : MT -v_lshrrev_b32 v133, 6, v[vgprSerial] // v133 = v[vgprSerial] / 64 -v_and_b32 v133, 3, v133 // v133 = v133 % 4 -v_lshrrev_b32 v134, 5, v131 // v134 = v131 / 32 -v_and_b32 v134, 3, v134 // v134 = v134 % 4 -v_cmp_eq_u32 s[64:65], v134, v133 // wave_id == block_belong_to_wave? -v_cndmask_b32 v131, v132, v131, s[64:65] // wgMT = (wgMT < MT) ? 
wgMT : MT - -/* mbReg: which mb block need to shift, mb(matrixInstCoal(16) * VectorWidth(2)) */ -v_lshrrev_b32 v132, 5, v131 // v132 = v131 / 32 -v_lshlrev_b32 v134, 0x0, v133 // v134 = v133 * 1 -_v_sub_u32 v132, v132, v134 // - -/* gbReg: glvw block id */ -v_lshrrev_b32 v134, 1, v131 // v134 = v131 / 2 - -/* tgbReg: glvw block id */ -v_lshrrev_b32 v135, 0, v[vgprSerial] // v135 = v[vgprSerial] / 1 -v_and_b32 v135, 15, v135 // v135 = v135 % 16 -v_lshlrev_b32 v135, 0x1, v135 // v135 = v135 * 2 -v_lshrrev_b32 v135, 1, v135 // v135 = v135 / 2 -v_lshlrev_b32 v133, 0x4, v133 // v133 = v133 * 16 -_v_add_co_u32 v135, vcc, v133, v135 // tgbReg = (tid_coal * continOut) / GLVW -_v_sub_u32 v134, v134, v135 // - -/* vwReg: glvw in which vw block? */ -v_and_b32 v133, 1, v131 // permute register between threads -v_lshrrev_b32 v133, 1, v133 // permute register between threads - -/* rReg : reminder of M_size % GlobalLoadVectorWidth */ -v_and_b32 v135, 1, v131 // v135 = v131 % 2 -v_cmp_eq_u32 vcc, v135, 0x1 // wgMT%VW == 1 -s_cbranch_vccnz label_0026 // branch to shift d0 r=1 -s_branch label_0029 // no shifting - -/******************************************/ -/* shift d0 r=1 */ -/******************************************/ -label_0026: -v_cmp_eq_u32 vcc, v132, 0x0 // -s_cbranch_vccnz label_0027 // branch to shift d0 r1 mb0 - -/******************************************/ -/* shift d0 r=1 mb=0 */ -/******************************************/ -label_0027: // r1 mb0 -v_cmp_eq_u32 vcc, v133, 0x0 // -s_cbranch_vccnz label_0028 // branch to shift d0 r1 mb0 vw0 - -/******************************************/ -/* shift d0 r=1 mb=0 vw0 */ -/******************************************/ -label_0028: // r1 mb0 vw0 -s_mov_b32 s64, 0 // -v_cmpx_eq_u32 s[64:65], v134, s64 // is thread in edge glvw region -v_and_b32 v128, 63, v[vgprSerial] // permute register between threads -v_lshlrev_b32 v128, 2, v128 // permute register between threads -v_mov_b32 v135, v8 // glvw 1 mb 0 tt1 0 r 0 -v_mov_b32 v0, 
v135 // -v_mov_b32 v135, v9 // glvw 1 mb 0 tt1 0 r 1 -v_mov_b32 v1, v135 // -v_mov_b32 v135, v10 // glvw 1 mb 0 tt1 1 r 0 -v_mov_b32 v2, v135 // -v_mov_b32 v135, v11 // glvw 1 mb 0 tt1 1 r 1 -v_mov_b32 v3, v135 // -v_mov_b32 v135, v12 // glvw 1 mb 0 tt1 2 r 0 -v_mov_b32 v4, v135 // -v_mov_b32 v135, v13 // glvw 1 mb 0 tt1 2 r 1 -v_mov_b32 v5, v135 // -v_mov_b32 v135, v14 // glvw 1 mb 0 tt1 3 r 0 -v_mov_b32 v6, v135 // -v_mov_b32 v135, v15 // glvw 1 mb 0 tt1 3 r 1 -v_mov_b32 v7, v135 // -v_mov_b32 v135, v24 // glvw 1 mb 0 tt1 4 r 0 -v_mov_b32 v16, v135 // -v_mov_b32 v135, v25 // glvw 1 mb 0 tt1 4 r 1 -v_mov_b32 v17, v135 // -v_mov_b32 v135, v26 // glvw 1 mb 0 tt1 5 r 0 -v_mov_b32 v18, v135 // -v_mov_b32 v135, v27 // glvw 1 mb 0 tt1 5 r 1 -v_mov_b32 v19, v135 // -v_mov_b32 v135, v28 // glvw 1 mb 0 tt1 6 r 0 -v_mov_b32 v20, v135 // -v_mov_b32 v135, v29 // glvw 1 mb 0 tt1 6 r 1 -v_mov_b32 v21, v135 // -v_mov_b32 v135, v30 // glvw 1 mb 0 tt1 7 r 0 -v_mov_b32 v22, v135 // -v_mov_b32 v135, v31 // glvw 1 mb 0 tt1 7 r 1 -v_mov_b32 v23, v135 // -v_mov_b32 v135, v40 // glvw 1 mb 0 tt1 8 r 0 -v_mov_b32 v32, v135 // -v_mov_b32 v135, v41 // glvw 1 mb 0 tt1 8 r 1 -v_mov_b32 v33, v135 // -v_mov_b32 v135, v42 // glvw 1 mb 0 tt1 9 r 0 -v_mov_b32 v34, v135 // -v_mov_b32 v135, v43 // glvw 1 mb 0 tt1 9 r 1 -v_mov_b32 v35, v135 // -v_mov_b32 v135, v44 // glvw 1 mb 0 tt1 10 r 0 -v_mov_b32 v36, v135 // -v_mov_b32 v135, v45 // glvw 1 mb 0 tt1 10 r 1 -v_mov_b32 v37, v135 // -v_mov_b32 v135, v46 // glvw 1 mb 0 tt1 11 r 0 -v_mov_b32 v38, v135 // -v_mov_b32 v135, v47 // glvw 1 mb 0 tt1 11 r 1 -v_mov_b32 v39, v135 // -v_mov_b32 v135, v56 // glvw 1 mb 0 tt1 12 r 0 -v_mov_b32 v48, v135 // -v_mov_b32 v135, v57 // glvw 1 mb 0 tt1 12 r 1 -v_mov_b32 v49, v135 // -v_mov_b32 v135, v58 // glvw 1 mb 0 tt1 13 r 0 -v_mov_b32 v50, v135 // -v_mov_b32 v135, v59 // glvw 1 mb 0 tt1 13 r 1 -v_mov_b32 v51, v135 // -v_mov_b32 v135, v60 // glvw 1 mb 0 tt1 14 r 0 -v_mov_b32 v52, v135 // -v_mov_b32 v135, v61 // glvw 1 
mb 0 tt1 14 r 1 -v_mov_b32 v53, v135 // -v_mov_b32 v135, v62 // glvw 1 mb 0 tt1 15 r 0 -v_mov_b32 v54, v135 // -v_mov_b32 v135, v63 // glvw 1 mb 0 tt1 15 r 1 -v_mov_b32 v55, v135 // -v_mov_b32 v135, v72 // glvw 1 mb 0 tt1 16 r 0 -v_mov_b32 v64, v135 // -v_mov_b32 v135, v73 // glvw 1 mb 0 tt1 16 r 1 -v_mov_b32 v65, v135 // -v_mov_b32 v135, v74 // glvw 1 mb 0 tt1 17 r 0 -v_mov_b32 v66, v135 // -v_mov_b32 v135, v75 // glvw 1 mb 0 tt1 17 r 1 -v_mov_b32 v67, v135 // -v_mov_b32 v135, v76 // glvw 1 mb 0 tt1 18 r 0 -v_mov_b32 v68, v135 // -v_mov_b32 v135, v77 // glvw 1 mb 0 tt1 18 r 1 -v_mov_b32 v69, v135 // -v_mov_b32 v135, v78 // glvw 1 mb 0 tt1 19 r 0 -v_mov_b32 v70, v135 // -v_mov_b32 v135, v79 // glvw 1 mb 0 tt1 19 r 1 -v_mov_b32 v71, v135 // -v_mov_b32 v135, v88 // glvw 1 mb 0 tt1 20 r 0 -v_mov_b32 v80, v135 // -v_mov_b32 v135, v89 // glvw 1 mb 0 tt1 20 r 1 -v_mov_b32 v81, v135 // -v_mov_b32 v135, v90 // glvw 1 mb 0 tt1 21 r 0 -v_mov_b32 v82, v135 // -v_mov_b32 v135, v91 // glvw 1 mb 0 tt1 21 r 1 -v_mov_b32 v83, v135 // -v_mov_b32 v135, v92 // glvw 1 mb 0 tt1 22 r 0 -v_mov_b32 v84, v135 // -v_mov_b32 v135, v93 // glvw 1 mb 0 tt1 22 r 1 -v_mov_b32 v85, v135 // -v_mov_b32 v135, v94 // glvw 1 mb 0 tt1 23 r 0 -v_mov_b32 v86, v135 // -v_mov_b32 v135, v95 // glvw 1 mb 0 tt1 23 r 1 -v_mov_b32 v87, v135 // -v_mov_b32 v135, v104 // glvw 1 mb 0 tt1 24 r 0 -v_mov_b32 v96, v135 // -v_mov_b32 v135, v105 // glvw 1 mb 0 tt1 24 r 1 -v_mov_b32 v97, v135 // -v_mov_b32 v135, v106 // glvw 1 mb 0 tt1 25 r 0 -v_mov_b32 v98, v135 // -v_mov_b32 v135, v107 // glvw 1 mb 0 tt1 25 r 1 -v_mov_b32 v99, v135 // -v_mov_b32 v135, v108 // glvw 1 mb 0 tt1 26 r 0 -v_mov_b32 v100, v135 // -v_mov_b32 v135, v109 // glvw 1 mb 0 tt1 26 r 1 -v_mov_b32 v101, v135 // -v_mov_b32 v135, v110 // glvw 1 mb 0 tt1 27 r 0 -v_mov_b32 v102, v135 // -v_mov_b32 v135, v111 // glvw 1 mb 0 tt1 27 r 1 -v_mov_b32 v103, v135 // -v_mov_b32 v135, v120 // glvw 1 mb 0 tt1 28 r 0 -v_mov_b32 v112, v135 // -v_mov_b32 v135, v121 // 
glvw 1 mb 0 tt1 28 r 1 -v_mov_b32 v113, v135 // -v_mov_b32 v135, v122 // glvw 1 mb 0 tt1 29 r 0 -v_mov_b32 v114, v135 // -v_mov_b32 v135, v123 // glvw 1 mb 0 tt1 29 r 1 -v_mov_b32 v115, v135 // -v_mov_b32 v135, v124 // glvw 1 mb 0 tt1 30 r 0 -v_mov_b32 v116, v135 // -v_mov_b32 v135, v125 // glvw 1 mb 0 tt1 30 r 1 -v_mov_b32 v117, v135 // -v_mov_b32 v135, v126 // glvw 1 mb 0 tt1 31 r 0 -v_mov_b32 v118, v135 // -v_mov_b32 v135, v127 // glvw 1 mb 0 tt1 31 r 1 -v_mov_b32 v119, v135 // -s_mov_b64 s[64:65], 0xFFFFFFFFFFFFFFFF // to restore all threads active -s_or_saveexec_b64 vcc, s[64:65] // all threads active -s_branch label_0029 // done shifting - -label_0029: // end shift0 - - - -/* not-LocalSplitU: global write indices */ - -/* computeStoreVgprs */ -v_lshrrev_b32 v132, 6, v[vgprSerial] // v132 = v[vgprSerial] / 64 -v_lshrrev_b32 v133, 2, v132 // v133 = v132 / 4 -v_mul_lo_u32 v133, 0x10, v133 // wave coordination offset 1 -v_and_b32 v129, 63, v[vgprSerial] // v129 = v[vgprSerial] % 64 -v_lshrrev_b32 v129, 4, v129 // v129 = v129 / 16 - // thread0 * continuous_output (multiplier is 1, do nothing) -v_add_u32 v129, v133, v129 // coordination 1 = wave_id1 + tid1 -v_mul_lo_u32 v130, v129, s[sgprStrideC1J] // offset 1 -v_mul_lo_u32 v131, v129, s[sgprStrideD1J] // offset 1 -v_and_b32 v128, 3, v132 // v128 = v132 % 4 -v_mul_lo_u32 v128, 0x10, v128 // wave coordination offset 0 -v_and_b32 v133, 15, v[vgprSerial] // v133 = v[vgprSerial] % 16 -_v_add_lshl_u32 v128, v133, v128, 1 // coordination 0 = wave_id0 + tid0 -s_mul_i32 s63, 128, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v128, s63, v128 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 -s_mul_i32 s63, 128, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v129, s63, v129 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 - - -/* not-LocalSplitU: global write */ - -s_mov_b32 s64, s[sgprBeta+0] // tmp = Beta[0] -s_or_b32 s64, s[sgprBeta+1], s64 // tmp |= Beta[1] -s_cmpk_eq_u32 s64, 0x0 // Beta == 0 -s_cbranch_scc0 GW_Beta_46 // 
Branch if Beta is not zero - -s_and_b32 s64, 127, s[sgprSizeI] // s64 = s[sgprSizeI] % 128 -s_add_u32 s65, -0x1, s[sgprNumWorkGroups0] // -s_cmp_ge_u32 s[sgprWorkGroup0], s65 // wg0 >= nwg0-1 ? -s_cselect_b32 s64, s64, 0 // set rMT0 -s_cmpk_gt_u32 s64, 0x0 // rMT0 > 0 -s_cbranch_scc1 GW_B0_E1_37 // jump if edges required -s_and_b32 s64, 127, s[sgprSizeJ] // s64 = s[sgprSizeJ] % 128 -s_add_u32 s65, -0x1, s[sgprNumWorkGroups1] // -s_cmp_ge_u32 s[sgprWorkGroup1], s65 // wg1 >= nwg1-1 -s_cselect_b32 s64, s64, 0 // set rMT1 -s_cmpk_gt_u32 s64, 0x0 // rMT1 > 0 -s_cbranch_scc1 GW_B0_E1_37 // jump if edges required -GW_B0_E0_34: - -/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */ -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #0 (d1,d0,vc1,vc0) = */ -/* (0,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(0,0,0,0) */ -_v_add_lshl_u32 v134, v131, v128, 0x3 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=128, coord0Vgpr=128 - -/* rC *= alpha batchEements=[(0, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #1 (d1,d0,vc1,vc0) = */ -/* (1,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(1,0,0,0) */ - -/* rC *= alpha batchEements=[(1, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Batch #2 (d1,d0,vc1,vc0) = */ -/* (2,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(2,0,0,0) */ - -/* rC *= alpha batchEements=[(2, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #3 (d1,d0,vc1,vc0) = */ -/* (3,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(3,0,0,0) */ - -/* rC *= alpha batchEements=[(3, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) 
-s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #4 (d1,d0,vc1,vc0) = */ -/* (4,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(4,0,0,0) */ - -/* rC *= alpha batchEements=[(4, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #5 (d1,d0,vc1,vc0) = */ -/* (5,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(5,0,0,0) */ - -/* rC *= alpha batchEements=[(5, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #6 (d1,d0,vc1,vc0) = */ -/* (6,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(6,0,0,0) */ - -/* rC *= alpha batchEements=[(6, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 
optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #7 (d1,d0,vc1,vc0) = */ -/* (7,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(7,0,0,0) */ - -/* rC *= alpha batchEements=[(7, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #8 (d1,d0,vc1,vc0) = */ -/* (8,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(8,0,0,0) */ - -/* rC *= alpha batchEements=[(8, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale 
StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #9 (d1,d0,vc1,vc0) = */ -/* (9,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(9,0,0,0) */ - -/* rC *= alpha batchEements=[(9, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #10 (d1,d0,vc1,vc0) = */ -/* (10,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* 
(d1,vc1,d0,vc0)=(10,0,0,0) */ - -/* rC *= alpha batchEements=[(10, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #11 (d1,d0,vc1,vc0) = */ -/* (11,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(11,0,0,0) */ - -/* rC *= alpha batchEements=[(11, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required 
when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #12 (d1,d0,vc1,vc0) = */ -/* (12,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(12,0,0,0) */ - -/* rC *= alpha batchEements=[(12, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #13 (d1,d0,vc1,vc0) = */ -/* (13,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(13,0,0,0) */ - -/* rC *= alpha batchEements=[(13, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out 
reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #14 (d1,d0,vc1,vc0) = */ -/* (14,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(14,0,0,0) */ - -/* rC *= alpha batchEements=[(14, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #15 (d1,d0,vc1,vc0) = */ -/* (15,0,0,0:vw2) */ 
-/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(15,0,0,0) */ - -/* rC *= alpha batchEements=[(15, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #16 (d1,d0,vc1,vc0) = */ -/* (16,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(16,0,0,0) */ - -/* rC *= alpha batchEements=[(16, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 
v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #17 (d1,d0,vc1,vc0) = */ -/* (17,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(17,0,0,0) */ - -/* rC *= alpha batchEements=[(17, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #18 (d1,d0,vc1,vc0) = */ -/* (18,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(18,0,0,0) */ - -/* rC *= alpha batchEements=[(18, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha 
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #19 (d1,d0,vc1,vc0) = */ -/* (19,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(19,0,0,0) */ - -/* rC *= alpha batchEements=[(19, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Batch #20 (d1,d0,vc1,vc0) = */ -/* (20,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(20,0,0,0) */ - -/* rC *= alpha batchEements=[(20, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #21 (d1,d0,vc1,vc0) = */ -/* (21,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(21,0,0,0) */ - -/* rC *= alpha batchEements=[(21, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += 
inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #22 (d1,d0,vc1,vc0) = */ -/* (22,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(22,0,0,0) */ - -/* rC *= alpha batchEements=[(22, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #23 (d1,d0,vc1,vc0) = */ -/* (23,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(23,0,0,0) */ - -/* rC *= alpha batchEements=[(23, 0, 0, 0)] */ -v_mul_f64 
v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #24 (d1,d0,vc1,vc0) = */ -/* (24,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(24,0,0,0) */ - -/* rC *= alpha batchEements=[(24, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 
optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #25 (d1,d0,vc1,vc0) = */ -/* (25,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(25,0,0,0) */ - -/* rC *= alpha batchEements=[(25, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #26 (d1,d0,vc1,vc0) = */ -/* (26,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(26,0,0,0) */ - -/* rC *= alpha batchEements=[(26, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, 
s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #27 (d1,d0,vc1,vc0) = */ -/* (27,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(27,0,0,0) */ - -/* rC *= alpha batchEements=[(27, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #28 (d1,d0,vc1,vc0) = */ -/* (28,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and 
issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(28,0,0,0) */ - -/* rC *= alpha batchEements=[(28, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #29 (d1,d0,vc1,vc0) = */ -/* (29,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(29,0,0,0) */ - -/* rC *= alpha batchEements=[(29, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // 
store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #30 (d1,d0,vc1,vc0) = */ -/* (30,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(30,0,0,0) */ - -/* rC *= alpha batchEements=[(30, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Batch #31 (d1,d0,vc1,vc0) = */ -/* (31,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(31,0,0,0) */ - -/* rC *= alpha batchEements=[(31, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_branch label_GW_End_45 // jump to end -GW_B0_E1_37: - -/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */ -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */ -/* (0,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(0, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */ -/* (0,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(0,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(0, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */ -/* (1,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(1,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(1, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */ -/* (1,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(1,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(1, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #4 (d1,d0,vc1,vc0) = */ -/* (2,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(2,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(2, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #5 (d1,d0,vc1,vc0) = */ -/* (2,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(2,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(2, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #6 (d1,d0,vc1,vc0) = */ -/* (3,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(3,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(3, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #7 (d1,d0,vc1,vc0) = */ -/* (3,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(3,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(3, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #8 (d1,d0,vc1,vc0) = */ -/* (4,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(4,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(4, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #9 (d1,d0,vc1,vc0) = */ -/* (4,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(4,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(4, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #10 (d1,d0,vc1,vc0) = */ -/* (5,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(5,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(5, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #11 (d1,d0,vc1,vc0) = */ -/* (5,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(5,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(5, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #12 (d1,d0,vc1,vc0) = */ -/* (6,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(6,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(6, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #13 (d1,d0,vc1,vc0) = */ -/* (6,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(6,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(6, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #14 (d1,d0,vc1,vc0) = */ -/* (7,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(7,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(7, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #15 (d1,d0,vc1,vc0) = */ -/* (7,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(7,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(7, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #16 (d1,d0,vc1,vc0) = */ -/* (8,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(8,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(8, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #17 (d1,d0,vc1,vc0) = */ -/* (8,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(8,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(8, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #18 (d1,d0,vc1,vc0) = */ -/* (9,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(9,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(9, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #19 (d1,d0,vc1,vc0) = */ -/* (9,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(9,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(9, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #20 (d1,d0,vc1,vc0) = */ -/* (10,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(10,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(10, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #21 (d1,d0,vc1,vc0) = */ -/* (10,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(10,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(10, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #22 (d1,d0,vc1,vc0) = */ -/* (11,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(11,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(11, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #23 (d1,d0,vc1,vc0) = */ -/* (11,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(11,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(11, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #24 (d1,d0,vc1,vc0) = */ -/* (12,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(12,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(12, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #25 (d1,d0,vc1,vc0) = */ -/* (12,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(12,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(12, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #26 (d1,d0,vc1,vc0) = */ -/* (13,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(13,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(13, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #27 (d1,d0,vc1,vc0) = */ -/* (13,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(13,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(13, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #28 (d1,d0,vc1,vc0) = */ -/* (14,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(14,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(14, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #29 (d1,d0,vc1,vc0) = */ -/* (14,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(14,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(14, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #30 (d1,d0,vc1,vc0) = */ -/* (15,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(15,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(15, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #31 (d1,d0,vc1,vc0) = */ -/* (15,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(15,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(15, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #32 (d1,d0,vc1,vc0) = */ -/* (16,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(16,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(16, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #33 (d1,d0,vc1,vc0) = */ -/* (16,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(16,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(16, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #34 (d1,d0,vc1,vc0) = */ -/* (17,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(17,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(17, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #35 (d1,d0,vc1,vc0) = */ -/* (17,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(17,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(17, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #36 (d1,d0,vc1,vc0) = */ -/* (18,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(18,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(18, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #37 (d1,d0,vc1,vc0) = */ -/* (18,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(18,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(18, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #38 (d1,d0,vc1,vc0) = */ -/* (19,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(19,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(19, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #39 (d1,d0,vc1,vc0) = */ -/* (19,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(19,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(19, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #40 (d1,d0,vc1,vc0) = */ -/* (20,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(20,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(20, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #41 (d1,d0,vc1,vc0) = */ -/* (20,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(20,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(20, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #42 (d1,d0,vc1,vc0) = */ -/* (21,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(21,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(21, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #43 (d1,d0,vc1,vc0) = */ -/* (21,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(21,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(21, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #44 (d1,d0,vc1,vc0) = */ -/* (22,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(22,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(22, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #45 (d1,d0,vc1,vc0) = */ -/* (22,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(22,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(22, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #46 (d1,d0,vc1,vc0) = */ -/* (23,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(23,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(23, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #47 (d1,d0,vc1,vc0) = */ -/* (23,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(23,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(23, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #48 (d1,d0,vc1,vc0) = */ -/* (24,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(24,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(24, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #49 (d1,d0,vc1,vc0) = */ -/* (24,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(24,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(24, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #50 (d1,d0,vc1,vc0) = */ -/* (25,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(25,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(25, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #51 (d1,d0,vc1,vc0) = */ -/* (25,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(25,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(25, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #52 (d1,d0,vc1,vc0) = */ -/* (26,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(26,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(26, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #53 (d1,d0,vc1,vc0) = */ -/* (26,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(26,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(26, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #54 (d1,d0,vc1,vc0) = */ -/* (27,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(27,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(27, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #55 (d1,d0,vc1,vc0) = */ -/* (27,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(27,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(27, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #56 (d1,d0,vc1,vc0) = */ -/* (28,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(28,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(28, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #57 (d1,d0,vc1,vc0) = */ -/* (28,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(28,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(28, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #58 (d1,d0,vc1,vc0) = */ -/* (29,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(29,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(29, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #59 (d1,d0,vc1,vc0) = */ -/* (29,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(29,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(29, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #60 (d1,d0,vc1,vc0) = */ -/* (30,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(30,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(30, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #61 (d1,d0,vc1,vc0) = */ -/* (30,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(30,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(30, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #62 (d1,d0,vc1,vc0) = */ -/* (31,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(31,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset - -/* rC *= alpha batchEements=[(31, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Edge Batch #63 (d1,d0,vc1,vc0) = */ -/* (31,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ -/* (d1,vc1,d0,vc0)=(31,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset - -/* rC *= alpha batchEements=[(31, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha - -/* apply mask, calc new C and issue writes */ -buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_branch label_GW_End_45 // jump to end -GW_Beta_46: -s_and_b32 s64, 127, s[sgprSizeI] // s64 = s[sgprSizeI] % 128 -s_add_u32 s65, -0x1, s[sgprNumWorkGroups0] // -s_cmp_ge_u32 s[sgprWorkGroup0], s65 // wg0 >= nwg0-1 ? 
-s_cselect_b32 s64, s64, 0 // set rMT0 -s_cmpk_gt_u32 s64, 0x0 // rMT0 > 0 -s_cbranch_scc1 GW_B1_E1_44 // jump if edges required -s_and_b32 s64, 127, s[sgprSizeJ] // s64 = s[sgprSizeJ] % 128 -s_add_u32 s65, -0x1, s[sgprNumWorkGroups1] // -s_cmp_ge_u32 s[sgprWorkGroup1], s65 // wg1 >= nwg1-1 -s_cselect_b32 s64, s64, 0 // set rMT1 -s_cmpk_gt_u32 s64, 0x0 // rMT1 > 0 -s_cbranch_scc1 GW_B1_E1_44 // jump if edges required -GW_B1_E0_41: - -/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */ -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */ -/* (0,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(0, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(0,0,0,0) */ -_v_add_lshl_u32 v135, v130, v128, 0x3 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=128, coord0Vgpr=128 -_v_add_lshl_u32 v134, v131, v128, 0x3 // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. 
BSHERE:coord0=128, coord0Vgpr=128 -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #1 (d1,d0,vc1,vc0) = */ -/* (1,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(1, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(1,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ 
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #2 (d1,d0,vc1,vc0) = */ -/* (2,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(2, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(2,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], 
v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #3 (d1,d0,vc1,vc0) = */ -/* (3,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(3, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(3,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], 
v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #4 (d1,d0,vc1,vc0) = */ -/* (4,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(4, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(4,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum 
= sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #5 (d1,d0,vc1,vc0) = */ -/* (5,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(5, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(5,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 
v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #6 (d1,d0,vc1,vc0) = */ -/* (6,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(6, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(6,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], 
s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #7 (d1,d0,vc1,vc0) = */ -/* (7,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(7, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(7,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], 
v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #8 (d1,d0,vc1,vc0) = */ -/* (8,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(8, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(8,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum 
= sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #9 (d1,d0,vc1,vc0) = */ -/* (9,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(9, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(9,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, 
s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #10 (d1,d0,vc1,vc0) = */ -/* (10,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(10, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(10,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= 
numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #11 (d1,d0,vc1,vc0) = */ -/* (11,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(11, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(11,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #12 (d1,d0,vc1,vc0) = */ -/* (12,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(12, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(12,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // 
incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #13 (d1,d0,vc1,vc0) = */ -/* (13,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(13, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(13,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) 
-s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #14 (d1,d0,vc1,vc0) = */ -/* (14,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(14, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(14,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 
0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #15 (d1,d0,vc1,vc0) = */ -/* (15,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(15, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(15,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) 
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #16 (d1,d0,vc1,vc0) = */ -/* (16,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(16, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(16,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, 
s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #17 (d1,d0,vc1,vc0) = */ -/* (17,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(17, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(17,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, 
offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #18 (d1,d0,vc1,vc0) = */ -/* (18,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(18, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(18,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 
1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #19 (d1,d0,vc1,vc0) = */ -/* (19,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(19, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(19,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst 
writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #20 (d1,d0,vc1,vc0) = */ -/* (20,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(20, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(20,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 
store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #21 (d1,d0,vc1,vc0) = */ -/* (21,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(21, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(21,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 
optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #22 (d1,d0,vc1,vc0) = */ -/* (22,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(22, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(22,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 
optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #23 (d1,d0,vc1,vc0) = */ -/* (23,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(23, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(23,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask 
optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #24 (d1,d0,vc1,vc0) = */ -/* (24,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(24, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(24,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // 
optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Batch #25 (d1,d0,vc1,vc0) = */ -/* (25,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(25, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(25,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier 
- -/******************************************/ -/* Global Write Beta Batch #26 (d1,d0,vc1,vc0) = */ -/* (26,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(26, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(26,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Beta Batch #27 (d1,d0,vc1,vc0) = */ -/* (27,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(27, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(27,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Beta Batch #28 (d1,d0,vc1,vc0) = */ -/* (28,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(28, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(28,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Beta Batch #29 (d1,d0,vc1,vc0) = */ -/* (29,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(29, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(29,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Beta Batch #30 (d1,d0,vc1,vc0) = */ -/* (30,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(30, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(30,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */ -s_sleep 5 // optimization: sync and wait -s_barrier - 
-/******************************************/ -/* Global Write Beta Batch #31 (d1,d0,vc1,vc0) = */ -/* (31,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(31, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(31,0,0,0) */ -s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) -s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_branch label_GW_End_45 // jump to end -GW_B1_E1_44: - -// TODO in Generator -// wider store if M % 2 == 0 -s_and_b32 s63, 0x1, s[sgprSizeI] -s_cbranch_scc0 GW_B1_E1_VW2 // done 
shifting - -/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */ -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ -/* (0,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(0, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ -/* (0,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(0, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(0,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */ -/* (1,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(1, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(1,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */ -/* (1,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(1, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(1,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #4 (d1,d0,vc1,vc0) = */ -/* (2,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(2, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(2,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #5 (d1,d0,vc1,vc0) = */ -/* (2,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(2, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(2,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #6 (d1,d0,vc1,vc0) = */ -/* (3,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(3, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(3,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #7 (d1,d0,vc1,vc0) = */ -/* (3,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(3, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(3,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #8 (d1,d0,vc1,vc0) = */ -/* (4,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(4, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(4,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #9 (d1,d0,vc1,vc0) = */ -/* (4,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(4, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(4,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #10 (d1,d0,vc1,vc0) = */ -/* (5,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(5, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(5,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #11 (d1,d0,vc1,vc0) = */ -/* (5,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(5, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(5,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #12 (d1,d0,vc1,vc0) = */ -/* (6,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(6, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(6,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #13 (d1,d0,vc1,vc0) = */ -/* (6,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(6, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(6,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #14 (d1,d0,vc1,vc0) = */ -/* (7,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(7, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(7,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #15 (d1,d0,vc1,vc0) = */ -/* (7,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(7, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(7,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #16 (d1,d0,vc1,vc0) = */ -/* (8,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(8, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(8,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #17 (d1,d0,vc1,vc0) = */ -/* (8,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(8, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(8,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #18 (d1,d0,vc1,vc0) = */ -/* (9,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(9, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(9,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #19 (d1,d0,vc1,vc0) = */ -/* (9,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(9, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(9,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #20 (d1,d0,vc1,vc0) = */ -/* (10,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(10, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(10,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #21 (d1,d0,vc1,vc0) = */ -/* (10,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(10, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(10,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #22 (d1,d0,vc1,vc0) = */ -/* (11,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(11, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(11,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #23 (d1,d0,vc1,vc0) = */ -/* (11,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(11, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(11,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #24 (d1,d0,vc1,vc0) = */ -/* (12,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(12, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(12,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #25 (d1,d0,vc1,vc0) = */ -/* (12,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(12, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(12,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #26 (d1,d0,vc1,vc0) = */ -/* (13,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(13, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(13,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #27 (d1,d0,vc1,vc0) = */ -/* (13,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(13, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(13,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #28 (d1,d0,vc1,vc0) = */ -/* (14,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(14, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(14,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #29 (d1,d0,vc1,vc0) = */ -/* (14,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(14, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(14,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #30 (d1,d0,vc1,vc0) = */ -/* (15,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(15, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(15,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #31 (d1,d0,vc1,vc0) = */ -/* (15,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(15, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(15,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #32 (d1,d0,vc1,vc0) = */ -/* (16,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(16, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(16,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #33 (d1,d0,vc1,vc0) = */ -/* (16,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(16, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(16,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #34 (d1,d0,vc1,vc0) = */ -/* (17,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(17, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(17,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #35 (d1,d0,vc1,vc0) = */ -/* (17,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(17, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(17,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #36 (d1,d0,vc1,vc0) = */ -/* (18,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(18, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(18,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #37 (d1,d0,vc1,vc0) = */ -/* (18,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(18, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(18,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #38 (d1,d0,vc1,vc0) = */ -/* (19,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(19, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(19,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #39 (d1,d0,vc1,vc0) = */ -/* (19,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(19, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(19,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #40 (d1,d0,vc1,vc0) = */ -/* (20,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(20, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(20,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #41 (d1,d0,vc1,vc0) = */ -/* (20,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(20, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(20,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #42 (d1,d0,vc1,vc0) = */ -/* (21,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(21, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(21,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #43 (d1,d0,vc1,vc0) = */ -/* (21,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(21, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(21,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #44 (d1,d0,vc1,vc0) = */ -/* (22,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(22, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(22,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #45 (d1,d0,vc1,vc0) = */ -/* (22,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(22, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(22,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #46 (d1,d0,vc1,vc0) = */ -/* (23,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(23, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(23,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #47 (d1,d0,vc1,vc0) = */ -/* (23,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(23, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(23,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #48 (d1,d0,vc1,vc0) = */ -/* (24,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(24, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(24,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #49 (d1,d0,vc1,vc0) = */ -/* (24,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(24, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(24,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #50 (d1,d0,vc1,vc0) = */ -/* (25,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(25, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(25,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #51 (d1,d0,vc1,vc0) = */ -/* (25,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(25, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(25,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #52 (d1,d0,vc1,vc0) = */ -/* (26,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(26, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(26,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #53 (d1,d0,vc1,vc0) = */ -/* (26,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(26, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(26,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #54 (d1,d0,vc1,vc0) = */ -/* (27,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(27, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(27,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #55 (d1,d0,vc1,vc0) = */ -/* (27,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(27, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(27,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #56 (d1,d0,vc1,vc0) = */ -/* (28,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(28, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(28,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #57 (d1,d0,vc1,vc0) = */ -/* (28,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(28, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(28,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #58 (d1,d0,vc1,vc0) = */ -/* (29,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(29, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(29,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #59 (d1,d0,vc1,vc0) = */ -/* (29,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(29, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(29,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #60 (d1,d0,vc1,vc0) = */ -/* (30,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(30, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(30,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #61 (d1,d0,vc1,vc0) = */ -/* (30,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(30, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(30,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #62 (d1,d0,vc1,vc0) = */ -/* (31,0,0,0:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(31, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(31,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. 
offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #63 (d1,d0,vc1,vc0) = */ -/* (31,0,0,1:vw1) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(31, 0, 0, 1)] */ -v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(31,0,0,1) */ -_v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_branch label_GW_End_45 // jump to end - -GW_B1_E1_VW2: - -/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */ -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */ -/* (0,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(0, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */ -/* (1,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(1, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(1,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], 
s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */ -/* (2,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(2, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(2,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, 
s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */ -/* (3,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(3, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] 
// Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(3,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #4 (d1,d0,vc1,vc0) = */ -/* (4,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(4, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(4,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], 
s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #5 (d1,d0,vc1,vc0) = */ -/* (5,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(5, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(5,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, 
s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #6 (d1,d0,vc1,vc0) = */ -/* (6,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(6, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] 
// Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(6,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #7 (d1,d0,vc1,vc0) = */ -/* (7,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(7, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(7,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], 
s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #8 (d1,d0,vc1,vc0) = */ -/* (8,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(8, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(8,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, 
s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #9 (d1,d0,vc1,vc0) = */ -/* (9,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(9, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] 
// Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(9,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #10 (d1,d0,vc1,vc0) = */ -/* (10,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(10, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(10,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #11 (d1,d0,vc1,vc0) = */ -/* (11,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(11, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(11,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ 
-s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #12 (d1,d0,vc1,vc0) = */ -/* (12,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(12, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(12,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #13 (d1,d0,vc1,vc0) = */ -/* (13,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(13, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(13,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #14 (d1,d0,vc1,vc0) = */ -/* (14,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(14, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(14,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ 
-s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #15 (d1,d0,vc1,vc0) = */ -/* (15,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(15, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(15,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #16 (d1,d0,vc1,vc0) = */ -/* (16,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(16, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(16,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #17 (d1,d0,vc1,vc0) = */ -/* (17,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(17, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(17,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ 
-s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #18 (d1,d0,vc1,vc0) = */ -/* (18,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(18, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(18,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #19 (d1,d0,vc1,vc0) = */ -/* (19,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(19, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(19,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #20 (d1,d0,vc1,vc0) = */ -/* (20,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(20, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(20,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ 
-s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #21 (d1,d0,vc1,vc0) = */ -/* (21,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(21, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(21,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #22 (d1,d0,vc1,vc0) = */ -/* (22,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(22, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(22,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #23 (d1,d0,vc1,vc0) = */ -/* (23,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(23, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(23,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ 
-s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #24 (d1,d0,vc1,vc0) = */ -/* (24,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(24, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(24,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #25 (d1,d0,vc1,vc0) = */ -/* (25,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(25, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(25,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #26 (d1,d0,vc1,vc0) = */ -/* (26,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(26, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(26,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode 
*/ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #27 (d1,d0,vc1,vc0) = */ -/* (27,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(27, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(27,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #28 (d1,d0,vc1,vc0) = */ -/* (28,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(28, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(28,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #29 (d1,d0,vc1,vc0) = */ -/* (29,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(29, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(29,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode 
*/ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #30 (d1,d0,vc1,vc0) = */ -/* (30,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(30, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(30,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */ -s_sleep 5 // optimization: sync and wait -s_barrier - -/******************************************/ -/* Global Write Beta Edge Batch #31 (d1,d0,vc1,vc0) = */ -/* (31,0,0,0:vw2) */ -/******************************************/ - -/* calc coords, apply mask, and issue loads (if necessary) */ - -/* rC *= alpha batchEements=[(31, 0, 0, 0)] */ -v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha -v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha -/* (d1,vc1,d0,vc0)=(31,0,0,0) */ -_v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 - -/* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row -v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 
s[38:39], s[64:65], s[38:39] // in0 && in1 -_v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset -_v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset -buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc -s_sleep 5 // optimization: sync and wait -s_barrier -s_waitcnt vmcnt(0) // wait C - -/* apply mask, calc new C and issue writes */ -v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta -v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D -s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst -s_branch label_GW_End_45 // jump to end -label_GW_End_45: - -label_0047: /// KernelEnd -s_endpgm // Kernel End - - diff --git a/src/Tensile/KernelWriter.py b/src/Tensile/KernelWriter.py index e51b22e631..5acb743afe 100644 --- a/src/Tensile/KernelWriter.py +++ b/src/Tensile/KernelWriter.py @@ -5291,7 +5291,7 @@ def getReplacementKernelPath(self, kernel): kernelName = self.getKernelName(kernel) if isCustomKernelConfig(kernel): - return os.path.join(globalParameters["CustomKernelDirectory"], (kernelName + ".s")) + return globalParameters["CustomKernelDirectory"].joinpath(kernelName + ".s") else: # Replacement kernel return ReplacementKernels.Get(kernelName) diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py index 455435c85c..fbe60be050 100644 --- a/src/Tensile/TensileCreateLibrary.py +++ b/src/Tensile/TensileCreateLibrary.py @@ -34,7 
+34,8 @@ from . import LibraryIO from . import Utils from .Common import globalParameters, HR, print1, print2, printExit, ensurePath, \ - CHeader, CMakeHeader, assignGlobalParameters, gfxName, architectureMap + CHeader, CMakeHeader, assignGlobalParameters, gfxName, architectureMap, \ + copy_data_files from .KernelWriterAssembly import KernelWriterAssembly from .KernelWriterSource import KernelWriterSource from .SolutionLibrary import MasterSolutionLibrary @@ -651,11 +652,7 @@ def copyStaticFiles(outputPath=None): "tensile_float8_bfloat8.h", "hip_f8_impl.h", "KernelHeader.h" ] - - for fileName in libraryStaticFiles: - # copy file - shutil.copy( os.path.join(globalParameters["SourcePath"], fileName), \ - outputPath ) + copy_data_files(libraryStaticFiles, outputPath) return libraryStaticFiles diff --git a/src/Tensile/data/Source/client/CMakeLists.txt b/src/Tensile/data/Source/client/CMakeLists.txt index cae52f54c1..cd4075de74 100644 --- a/src/Tensile/data/Source/client/CMakeLists.txt +++ b/src/Tensile/data/Source/client/CMakeLists.txt @@ -60,7 +60,7 @@ find_package(Boost COMPONENTS program_options REQUIRED) if (NOT WIN32) find_package(ROCmSMI QUIET) if(NOT ROCmSMI_FOUND) - set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${Tensile_DIR}" "${Tensile_DIR}/../Source/cmake" "${CMAKE_SOURCE_DIR}/cmake") + set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${CMAKE_SOURCE_DIR}/cmake") find_package(ROCmSMI REQUIRED) endif() endif() diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 b/src/Tensile/data/template/cmake/TensileConfigVersion.cmake.j2 similarity index 96% rename from src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 rename to src/Tensile/data/template/cmake/TensileConfigVersion.cmake.j2 index 1c56f4a612..5b6d82ecbb 100644 --- a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 +++ b/src/Tensile/data/template/cmake/TensileConfigVersion.cmake.j2 @@ -27,7 +27,7 @@ set(TENSILE_VERSION_MINOR {TENSILE_VERSION_MINOR}) set(TENSILE_VERSION_PATCH 
{TENSILE_VERSION_PATCH}) # export version -set(PACKAGE_VERSION "${TENSILE_VERSION_MAJOR}.${TENSILE_VERSION_MINOR}.${TENSILE_VERSION_PATCH}") +set(PACKAGE_VERSION "{TENSILE_VERSION_MAJOR}.{TENSILE_VERSION_MINOR}.{TENSILE_VERSION_PATCH}") if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) set(PACKAGE_VERSION_EXACT TRUE)