Skip to content

Commit

Permalink
Port packer fix for partial faces from WH B0 & matmul calculation for partial face/narrow tile combination on input (#29)
Browse files Browse the repository at this point in the history

* Port commit 4c0043027d055d6a9c7986c5c57c12c61b695574 from tt-llk-wh-b0

* Fix matmul for tiny tile on in0 and narrow tile on in1
  • Loading branch information
nvelickovicTT authored Aug 8, 2024
1 parent 74d71e3 commit 841a944
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 16 deletions.
18 changes: 9 additions & 9 deletions common/inc/cpack_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace ckernel::packer
// //uint32_t read_mode : 1; //Removed in BH
// uint32_t exp_threshold_en : 1;
// uint32_t reserved_2 : 1;
// uint32_t unp_lf8_4b_exp: 1;
// uint32_t unp_lf8_4b_exp: 1;
// uint32_t pac_lf8_4b_exp: 1;
// uint32_t exp_threshold : 8;
} pack_config_t;
Expand Down Expand Up @@ -155,11 +155,11 @@ namespace ckernel::packer
(uint)(pack_src_format&0x3) == (uint)DataFormat::Float16 ? 2 : 1;
uint y_stride = FACE_C_DIM*x_stride;
uint w_stride = TILE_NUM_FACES*FACE_C_DIM*FACE_R_DIM*x_stride;

// Untilize mode has 2 packer interfaces active, so z counter needs to jump by 2
// faces, since z counter is only 1 bit (can't be programmed to inc by 2)
const uint z_stride = ((untilize ^ tilize) && (tile_c_dim == TILE_C_DIM)) ? 2*FACE_R_DIM*y_stride : FACE_R_DIM*y_stride;


TT_SETDMAREG(0, LOWER_HALFWORD((y_stride<<PCK0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT)), 0, LO_16(p_gpr_pack::TMP0)); //x-stride not used!
TT_SETDMAREG(0, UPPER_HALFWORD((y_stride<<PCK0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT)), 0, HI_16(p_gpr_pack::TMP0));
Expand Down Expand Up @@ -212,7 +212,7 @@ namespace ckernel::packer
exp_threshold_val = 113;
}
// EXP threshold is updated in the config word 3 which has a bit programmed by the unpacker as well
constexpr uint exp_threshold_rmw_mask = THCON_SEC0_REG1_Exp_threshold_en_MASK|THCON_SEC0_REG1_Exp_threshold_MASK;
constexpr uint exp_threshold_rmw_mask = THCON_SEC0_REG1_Exp_threshold_en_MASK|THCON_SEC0_REG1_Exp_threshold_MASK;
uint exp_threshold_rmw_data = (exp_threshold_val << THCON_SEC0_REG1_Exp_threshold_SHAMT) | (exp_threshold_en << THCON_SEC0_REG1_Exp_threshold_en_SHAMT);
cfg_reg_rmw_tensix<THCON_SEC0_REG1_Row_start_section_size_ADDR32+3,0,exp_threshold_rmw_mask>(exp_threshold_rmw_data);
}
Expand Down Expand Up @@ -259,7 +259,7 @@ namespace ckernel::packer
cfg[PCK_DEST_RD_CTRL_Read_32b_data_ADDR32] = dest_rd_ctrl.val;

// Save to GPR for quick data format reconfig
regfile[p_gpr_pack::EXP0_SEC_SIZE_BFP] = (num_faces) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
regfile[p_gpr_pack::EXP0_SEC_SIZE_BFP] = (partial_face ? 1 : num_faces) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
sync_regfile_write(p_gpr_pack::EXP0_SEC_SIZE_BFP);
}

Expand Down Expand Up @@ -306,7 +306,7 @@ namespace ckernel::packer
exp_threshold_val = 113;
}
// EXP threshold is updated in the config word 3 which has a bit programmed by the unpacker as well
constexpr uint exp_threshold_rmw_mask = THCON_SEC0_REG1_Exp_threshold_en_MASK|THCON_SEC0_REG1_Exp_threshold_MASK;
constexpr uint exp_threshold_rmw_mask = THCON_SEC0_REG1_Exp_threshold_en_MASK|THCON_SEC0_REG1_Exp_threshold_MASK;
uint exp_threshold_rmw_data = (exp_threshold_val << THCON_SEC0_REG1_Exp_threshold_SHAMT) | (exp_threshold_en << THCON_SEC0_REG1_Exp_threshold_en_SHAMT);
cfg_reg_rmw_tensix<THCON_SEC0_REG1_Row_start_section_size_ADDR32+3,0,exp_threshold_rmw_mask>(exp_threshold_rmw_data);
}
Expand Down Expand Up @@ -343,7 +343,7 @@ namespace ckernel::packer

t6_mutex_acquire(mutex::REG_RMW);

//Set Fp8 E4M3 mode for packer
//Set Fp8 E4M3 mode for packer
if((pack_dst_format&0x1F) == (uint)DataFormat::Fp8_e4m3) {
cfg_reg_rmw_tensix<THCON_SEC0_REG1_Pac_LF8_4b_exp_RMW>(1);
}
Expand Down Expand Up @@ -388,7 +388,7 @@ namespace ckernel::packer
cfg[STACC_RELU_ApplyRelu_ADDR32] = hw_relu_config.val[0];

// In Blackhole, x_start/x_end must be within 1 row size (i.e. from 0 to 15)
TT_SETADCXX(p_setadc::PAC, FACE_C_DIM-1, 0x0);
TT_SETADCXX(p_setadc::PAC, FACE_C_DIM-1, 0x0);

}

Expand Down Expand Up @@ -492,7 +492,7 @@ namespace ckernel::packer
const uint32_t pack_l1_acc_disable_pack_zero_flag = pack_l1_acc ? (0b11) : (0b00);

cfg_reg_rmw_tensix<THCON_SEC0_REG1_Pack_L1_Acc_ADDR32, THCON_SEC0_REG1_Pack_L1_Acc_SHAMT, THCON_SEC0_REG1_Disable_pack_zero_flags_MASK | THCON_SEC0_REG1_Pack_L1_Acc_MASK>(pack_l1_acc_disable_pack_zero_flag);

}

// Write tile header to l1
Expand Down
20 changes: 13 additions & 7 deletions llk_lib/llk_math_matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,20 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con
// Lambda function to load reply buffer
[high_fidelity, reuse_a, partial_face, is_in1_32x16, is_in0_16x32, is_in0_32x16, is_in1_16x32, t_dim] {
if (is_in1_32x16) {
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A0 // srca=srca, srcb+=8, dest+=8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_1, 0); // B0A0 // srca+=16, srcb+=8, dest=0
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B1A1 // srca=srca, srcb+=8, dest=+8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_4, 0); // B1A1 // srca=0, srcb+=8, dest=16 (addr_mod_4), bias=0
if (is_in0_16x32) {
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A0 // srca=srca, srcb+=8, dest+=8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_1, 0); // B0A0 // srca+=16, srcb+=8, dest=0
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B1A1 // srca=srca, srcb+=8, dest=+8, bias=1
} else {
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A0 // srca=srca, srcb+=8, dest+=8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_1, 0); // B0A0 // srca+=16, srcb+=8, dest=0
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B1A1 // srca=srca, srcb+=8, dest=+8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_4, 0); // B1A1 // srca=0, srcb+=8, dest=16 (addr_mod_4), bias=0

TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B2A0 // srca=srca, srcb+=8, dest+=8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B2A0 // srca+=16, srcb+=8, dest=16
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B3A1 // srca=srca, srcb+=8, dest+=8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B2A0 // srca=srca, srcb+=8, dest+=8
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B2A0 // srca+=16, srcb+=8, dest=16
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B3A1 // srca=srca, srcb+=8, dest+=8
}
} else if (is_in0_16x32 || is_in0_32x16) {
if (partial_face) {
TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16, srcb=0, dest=+16
Expand Down

0 comments on commit 841a944

Please sign in to comment.