Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add binary SFPU OPs - eltwise & bitwise #56

Merged
merged 2 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common/inc/ckernel_sfpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "sfpu/ckernel_sfpu_abs.h"
#include "sfpu/ckernel_sfpu_add_int32.h"
#include "sfpu/ckernel_sfpu_binary.h"
#include "sfpu/ckernel_sfpu_binary_bitwise.h"
#include "sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h"
#include "sfpu/ckernel_sfpu_clamp.h"
#include "sfpu/ckernel_sfpu_comp.h"
Expand Down
2 changes: 1 addition & 1 deletion common/inc/sfpu/ckernel_sfpu_add_int32.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _add_int32_(const int iterations, const uint dst_offset) {
inline void _add_int32_(const uint dst_offset) {
// Operand A is input1 (int32)
// Operand B is input2 (int32)
// Output is int32
Expand Down
139 changes: 139 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_binary.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel.h"
#include "ckernel_defs.h"
#include "noc_nonblocking_api.h"
#include "ckernel_sfpu_exp.h"
#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// BINOP_MODE: selector values for the BINOP_MODE template parameter of
// _calculate_sfpu_binary_. The enum is intentionally unnamed so the
// enumerators can be passed directly as `int` template arguments.
enum
{
    ADD_BINARY  = 0, // in0 + in1
    SUB_BINARY  = 1, // in0 - in1
    MUL_BINARY  = 2, // in0 * in1
    DIV_BINARY  = 3, // in0 / in1 (with explicit handling of a zero divisor)
    RSUB_BINARY = 4, // in1 - in0 (reversed subtraction)
    POW_BINARY  = 5  // in0 raised to the power in1
};

// Computes base^pow per vector lane as exp(pow * ln(base)).
//
// Outline:
//   1. Detect whether pow is integer-valued (float -> int16 -> float round trip).
//      If so, the sign of base is cleared so the logarithm is taken of |base|.
//   2. ln(base) is formed as ln(mantissa) + exponent * ln(2), where ln(mantissa)
//      comes from a 3rd order polynomial fitted over [1, 2].
//   3. exp() is evaluated on |pow * ln(base)| and the reciprocal is taken when
//      the product was negative.
//   4. For a negative original base: odd integer pow -> result is made negative,
//      even integer pow -> result stays positive, non-integer pow -> NaN.
//
// Parameters:
//   base - value to be raised to a power
//   pow  - exponent
// Returns:
//   vFloat holding base^pow for each lane.
sfpi_inline vFloat _calculate_sfpu_binary_power_(vFloat base, vFloat pow)
{
// Keep the unmodified input; `base` may have its sign cleared below, but the
// final sign/NaN fix-up needs to know whether the caller's base was negative.
vFloat original_base = base;

// Check for integer power
vInt pow_int = float_to_int16(pow, 0); // int16 should be plenty, since large powers will approach 0/Inf
vFloat pow_rounded = int32_to_float(pow_int, 0);
v_if (pow_rounded == pow) {
// if pow is integer, set base to positive
base = setsgn(base, 0);
}
v_endif;

// Normalize base to calculation range
vFloat x = setexp(base, 127); // set exp to exp bias (put base in range of 1-2)

// 3rd order polynomial approx - determined using rminimax over [1,2]
// Evaluated in Horner form; yields ~0 at x = 1 and ~ln(2) at x = 2, i.e. it
// approximates ln(x) on the normalized range.
vFloat series_result = x * (x * (x * 0x2.44734p-4f - 0xd.e712ap-4f) + 0x2.4f5388p+0f) - 0x1.952992p+0f;

// Convert exponent to float
vInt exp = exexp(base);
v_if (exp < 0) {
// Negate (two's complement) and then set the sign bit.
// NOTE(review): presumably int32_to_float expects a sign-magnitude integer - confirm.
exp = setsgn(~exp + 1, 1);
}
v_endif;
vFloat expf = int32_to_float(exp, 0);

// De-normalize to original range
// NOTE(review): ln(2) is ~0.693147; the constant below is slightly smaller.
// Presumably chosen to match a reduced-precision constant used elsewhere - confirm.
vFloat vConstLn2 = 0.692871f;
vFloat log_result = expf * vConstLn2 + series_result; // ln(base) = exp*ln(2) + ln(mantissa)

// Base case when input is 0. ln(0) = -inf
v_if (base == 0.0f) { // NOTE(review): the previous comment here mentioned a "reload for register pressure", but `base` is compared directly
log_result = -std::numeric_limits<float>::infinity();
}
v_endif;

// Take exp(pow * log(base)) to produce base^pow
vFloat val = pow * log_result;

// Force sign to 0 (make number positive)
// exp() is evaluated on the magnitude only; negative arguments are handled
// by the reciprocal below, since exp(-v) == 1 / exp(v).
vFloat result = _sfpu_exp_(setsgn(val, 0));

v_if (val < 0) {
result = _sfpu_reciprocal_(result);
}
v_endif;

// Check valid base range
v_if (original_base < 0.0f) { // negative base
// Check for integer power
v_if (pow_rounded == pow) {
// if pow is odd integer, set result to negative
v_if (pow_int & 0x1) {
result = setsgn(result, 1);
}
v_endif;
} v_else {
// Negative base with a non-integer exponent has no real result.
result = std::numeric_limits<float>::quiet_NaN();
}
v_endif;
}
v_endif;

return result;
}

// Element-wise binary SFPU kernel.
//
// For each of ITERATIONS steps, reads operand A from dst_reg[0] and operand B
// from dst_reg[dst_offset * 32], applies the operation selected by BINOP_MODE,
// writes the result back to dst_reg[0] (overwriting operand A) and advances
// dst_reg by one.
//
// Template parameters:
//   APPROXIMATION_MODE - not referenced by this function body.
//   BINOP_MODE         - one of ADD_BINARY, SUB_BINARY, MUL_BINARY, DIV_BINARY,
//                        RSUB_BINARY, POW_BINARY. Any other value is rejected at
//                        compile time.
//   ITERATIONS         - number of loop iterations (default 8).
//
// Parameters:
//   dst_offset - offset of operand B relative to operand A, in units of
//                dst_tile_size dst_reg entries.
//                NOTE(review): presumably a tile index delta - confirm with callers.
template <bool APPROXIMATION_MODE, int BINOP_MODE, int ITERATIONS = 8>
inline void _calculate_sfpu_binary_(const uint dst_offset)
{
    // Without this check an unknown mode would match none of the branches below
    // and the kernel would silently overwrite the destination with 0.0.
    static_assert(
        BINOP_MODE == ADD_BINARY || BINOP_MODE == SUB_BINARY || BINOP_MODE == MUL_BINARY ||
            BINOP_MODE == DIV_BINARY || BINOP_MODE == RSUB_BINARY || BINOP_MODE == POW_BINARY,
        "_calculate_sfpu_binary_: unsupported BINOP_MODE");

    constexpr uint dst_tile_size = 32;

    // Loop-invariant: index of operand B relative to the current dst_reg position.
    const uint in1_index = dst_offset * dst_tile_size;

    // SFPU microcode
    for (int d = 0; d < ITERATIONS; d++) {
        vFloat in0 = dst_reg[0];
        vFloat in1 = dst_reg[in1_index];
        vFloat result = 0.0f;

        if constexpr (BINOP_MODE == ADD_BINARY) {
            result = in0 + in1;
        } else if constexpr (BINOP_MODE == SUB_BINARY) {
            result = in0 - in1;
        } else if constexpr (BINOP_MODE == MUL_BINARY) {
            result = in0 * in1;
        } else if constexpr (BINOP_MODE == DIV_BINARY) {
            v_if (in1 == 0) {
                // Division by zero: 0/0 -> NaN, x/0 -> infinity carrying the sign of x.
                v_if (in0 == 0) {
                    result = std::numeric_limits<float>::quiet_NaN();
                } v_else {
                    result = std::numeric_limits<float>::infinity();
                    result = setsgn(result, in0);
                }
                v_endif;
            } v_elseif (in0 == in1) {
                // Equal operands: return exactly 1 rather than going through the
                // reciprocal computation.
                result = vConst1;
            } v_else {
                // General case: multiply by the reciprocal of in1, with the sign of
                // in1 applied to the reciprocal explicitly.
                // NOTE(review): the template argument <4> is presumably an iteration
                // count for the reciprocal refinement - confirm.
                result = in0 * setsgn(_sfpu_reciprocal_<4>(in1), in1);
            }
            v_endif;
        } else if constexpr (BINOP_MODE == RSUB_BINARY) {
            result = in1 - in0;
        } else if constexpr (BINOP_MODE == POW_BINARY) {
            result = _calculate_sfpu_binary_power_(in0, in1);
        }

        dst_reg[0] = result;
        dst_reg++;
    }
}

} // namespace sfpu
} // namespace ckernel
50 changes: 50 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_binary_bitwise.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel.h"
#include "ckernel_defs.h"
#include "noc_nonblocking_api.h"
#include "sfpi.h"
#include <limits.h>

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// BITWISE_MODE: selector values for the BITWISE_MODE template parameter of
// _calculate_sfpu_binary_bitwise_. The enum is intentionally unnamed so the
// enumerators can be passed directly as `int` template arguments.
enum
{
    AND_BINARY = 0, // bitwise AND
    OR_BINARY  = 1, // bitwise OR
    XOR_BINARY = 2, // bitwise XOR
};

template <bool APPROXIMATION_MODE, int BITWISE_MODE, int ITERATIONS = 8>
inline void _calculate_sfpu_binary_bitwise_(const uint dst_offset)
{
// SFPU microcode
for (int d = 0; d < ITERATIONS; d++) {
constexpr uint dst_tile_size = 64;

TTI_SFPLOAD(0,12,ADDR_MOD_7,0);
TT_SFPLOAD(1,12,ADDR_MOD_7,dst_offset*dst_tile_size);

if constexpr (BITWISE_MODE == AND_BINARY) {
TTI_SFPAND(0,1,0,0);
} else if constexpr (BITWISE_MODE == OR_BINARY) {
TTI_SFPOR(0,1,0,0);
} else if constexpr (BITWISE_MODE == XOR_BINARY) {
TTI_SFPXOR(0,1,0,0);
}

TTI_SFPSTORE(0,12,ADDR_MOD_7,0);
dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
8 changes: 4 additions & 4 deletions common/inc/sfpu/ckernel_sfpu_quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _quant_int32_(const int iterations, const uint dst_offset)
inline void _quant_int32_(const uint dst_offset)
{
// Operand A is input (fp32)
// Operand B is scaling factor (fp32)
Expand All @@ -43,7 +43,7 @@ inline void _quant_int32_(const int iterations, const uint dst_offset)
}

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _requant_int32_(const int iterations, const uint dst_offset)
inline void _requant_int32_(const uint dst_offset)
{
// Operand A is input to requant (int32)
// Operand B is scaling factor (fp32)
Expand Down Expand Up @@ -71,7 +71,7 @@ inline void _requant_int32_(const int iterations, const uint dst_offset)
}

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _dequant_int32_(const int iterations, const uint dst_offset)
inline void _dequant_int32_(const uint dst_offset)
{
// Operand A[LREG0] is input to dequant (int32)
// Operand B[LREG1] is scaling factor (fp32)
Expand All @@ -97,7 +97,7 @@ inline void _dequant_int32_(const int iterations, const uint dst_offset)
}
}

inline void init_quant_zero_point(const uint zero_point)
inline void _init_quant_zero_point_(const uint zero_point)
{
_sfpu_load_imm32_(2,zero_point);
}
Expand Down
2 changes: 2 additions & 0 deletions llk_lib/llk_math_eltwise_binary_sfpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ inline void _llk_math_eltwise_binary_sfpu_start_(const uint dst_index) {
} else {
math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(dst_index);
}
TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
}

inline void _llk_math_eltwise_binary_sfpu_done_() {
Expand All @@ -48,6 +49,7 @@ inline void _llk_math_eltwise_binary_sfpu_inc_dst_face_addr_() {
math::inc_dst_addr<8>();
}

template <SfpuType sfpu_op>
inline void _llk_math_eltwise_binary_sfpu_init_() {
sfpu::_init_sfpu_config_reg();
eltwise_binary_sfpu_configure_addrmod();
Expand Down
Loading