Add subdevice support to multicore untilize #16193

Open

wants to merge 11 commits into base: main

@@ -11,6 +11,66 @@
from models.utility_functions import is_grayskull, skip_for_blackhole


@pytest.mark.parametrize(
    "dtype",
    (ttnn.bfloat16,),
    ids=[
        "bfloat16",
    ],
)
@pytest.mark.parametrize(
    "nb, nc, nh, nw",
    (
        # llama shapes
        (1, 1, 32, 128 * 1024),

Contributor: No other shapes are needed?

Contributor Author: For our model use case, this is the only one.

    ),
)
@pytest.mark.parametrize("device_params", [{"dispatch_core_axis": ttnn.DispatchCoreAxis.COL}], indirect=True)
def test_run_untilize_subcoregrid_test(dtype, nb, nc, nh, nw, device):
    if is_grayskull() and dtype == ttnn.float32:
        pytest.skip("Skipping float32 tests on Grayskull")
    device.enable_async(True)
    shape = [nb, nc, nh, nw]

    torch.set_printoptions(precision=3, sci_mode=False, linewidth=3000, threshold=10000, edgeitems=128)

    torch.manual_seed(10)

    inp = torch.rand(*shape).bfloat16()

    a = ttnn.Tensor(
        inp.flatten().tolist(),
        shape,
        dtype,
        ttnn.TILE_LAYOUT,
        device,
    )

    out_mem_config = ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1)

    b1 = ttnn.untilize(
        a,
        memory_config=out_mem_config,
        use_multicore=True,
        sub_core_grids=ttnn.CoreRangeSet(
            {
                ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)),
                ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(6, 9)),
            }
        ),
    )
    c1 = b1.cpu().to_torch()

    untilized_inp = untilize(inp)

    if dtype == ttnn.float32:
        passing1, output = comp_pcc(untilized_inp, c1, 0.999999)
        logger.info(output)
    else:
        passing1 = torch.equal(untilized_inp, c1)
    assert passing1


@pytest.mark.parametrize(
    "dtype",
    (ttnn.bfloat16, ttnn.float32),
@@ -40,7 +100,6 @@
def test_run_untilize_test(dtype, nb, nc, nh, nw, device):
    if is_grayskull() and dtype == ttnn.float32:
        pytest.skip("Skipping float32 tests on Grayskull")
-
    shape = [nb, nc, 32 * nh, 32 * nw]

    torch.set_printoptions(precision=3, sci_mode=False, linewidth=3000, threshold=10000, edgeitems=128)
@@ -72,7 +131,6 @@ def test_run_untilize_test(dtype, nb, nc, nh, nw, device):
        logger.info(output)
    else:
        passing1 = torch.equal(untilized_inp, c1)
-
    assert passing1



@@ -113,7 +113,7 @@ operation::ProgramWithCallbacks Untilize::create_program(
    auto& output_tensor = output_tensors.at(0);
    if (this->use_multicore) {
        return detail::untilize_multi_core(
-           input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en);
+           input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en, this->sub_core_grids);
    } else {
        return detail::untilize_single_core(
            input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en);

@@ -22,6 +22,7 @@ struct Untilize {
    const bool use_multicore;
    const bool use_pack_untilize;
    const bool fp32_dest_acc_en;
+   const std::optional<CoreRangeSet> sub_core_grids;

    void validate(const std::vector<Tensor>& input_tensors) const;
    std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor>& input_tensors) const;

@@ -29,6 +29,190 @@ uint32_t get_largest_divisor(uint32_t dividend, uint32_t starting_divisor, uint3
    return 1;
}
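
The diff shows only the tail of get_largest_divisor above. A plausible reconstruction from its signature and the call sites below, assuming the third argument acts as a fixed co-divisor (an illustration, not the repository's verbatim body):

    // Largest divisor <= starting_divisor for which dividend % (divisor * divisor_factor) == 0.
    uint32_t get_largest_divisor(uint32_t dividend, uint32_t starting_divisor, uint32_t divisor_factor = 1) {
        for (uint32_t divisor = starting_divisor; divisor >= 1; divisor--) {
            if (dividend % (divisor * divisor_factor) == 0) {
                return divisor;
            }
        }
        return 1;
    }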

operation::ProgramWithCallbacks untilize_multi_core_parallelize_column_subgrid(
    const Tensor& a,

Contributor: Is the support here 1:1 with the version without subgrids? Asking because I'm curious if the validate needs updating.

Contributor Author: The regular version splits the cores into core_range and core_range_cliff, but when I ran it for my shapes, core_range_cliff was empty, so I think it is 1:1. I think validate is fine.

    Tensor& output,
    bool use_pack_untilize,
    bool fp32_dest_acc_en,
    const CoreRangeSet& sub_core_grids) {
    tt::tt_metal::Program program{};

    tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype());
    uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format);
    tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype());
    uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format);

    Device* device = a.device();

    uint32_t ntiles = a.volume() / TILE_HW;
    uint32_t ncores = sub_core_grids.num_cores();
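    // Trim ncores to the largest core count that still divides ntiles evenly,
    // so every selected core receives the same number of tiles (no cliff core).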
    for (uint32_t core_id = ncores; core_id >= 1; core_id--) {
        if (ntiles % ncores == 0) {
            break;
        } else {
            ncores--;
        }
    }

    TT_ASSERT(ntiles % (ncores) == 0);

    uint32_t max_tiles = 1;
    uint32_t ntiles_per_block = ntiles / ncores;
    uint32_t stick_s = a.get_legacy_shape()[-1];
    uint32_t ntiles_per_row = stick_s / TILE_WIDTH;
    uint32_t stick_size = stick_s * output.element_size();
    uint32_t ntiles_per_column = ntiles / ntiles_per_row;
    uint32_t starting_tile = ntiles_per_block;
    if (ntiles_per_row > max_tiles) {
        starting_tile = max_tiles;
    }
    ntiles_per_block = get_largest_divisor(ntiles_per_row, starting_tile);
    TT_ASSERT(
        ntiles_per_row % ntiles_per_block == 0 and ntiles_per_block >= 1 and ntiles_per_block <= ntiles_per_row and
        ntiles % ntiles_per_block == 0);

    uint32_t nblocks = (ntiles / ntiles_per_block);
    uint32_t block_size_nbytes = input_single_tile_size;
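    // Worked example, assuming the llama shape [1, 1, 32, 128 * 1024] and the
    // 50-core subgrid exercised by the test above:
    //   ntiles           = (32 * 131072) / 1024 = 4096
    //   ncores           = 50, trimmed to 32 (the largest count dividing 4096)
    //   ntiles_per_row   = 131072 / 32 = 4096, so ntiles_per_column = 1
    //   ntiles_per_block = get_largest_divisor(4096, 1) = 1, hence nblocks = 4096
    //   i.e. each core handles 4096 / 32 = 128 single-tile blocks of one tile row.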

    auto cores = corerange_to_cores(sub_core_grids, ncores, true);
    auto all_cores = num_cores_to_corerangeset_in_subcoregrids(cores[0], ncores, sub_core_grids, true);

⚠️ clang-diagnostic-error ⚠️ use of undeclared identifier num_cores_to_corerangeset_in_subcoregrids

Contributor: Is the only difference here that we're using a different work_split and num_cores_to_corerangeset_in_subcoregrids to get the cores? If so, could you just have one function that if-elses when it's nullopt?

Contributor Author: The multicore version has some computations for num_x_cores and num_y_cores and splits the cores into core_range and core_range_cliff:

    auto [ncores, all_cores, core_range, core_range_cliff, nblocks_per_core, nblocks_per_core_cliff] =
        ttnn::split_blocks_for_tilize(CoreCoord(ncores_x, ncores_y), nblocks);
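
A minimal sketch of the suggested unification, branching on the optional subgrid inside a single work-assignment helper; WorkSplit and assign_work are hypothetical simplifications of ttnn::split_blocks_for_tilize and the subgrid selection above, not actual tt-metal APIs:

    #include <cstdint>
    #include <optional>

    struct WorkSplit {
        uint32_t ncores;
        uint32_t nblocks_per_core;
        uint32_t nblocks_per_core_cliff;  // 0 when the work divides evenly
    };

    // One entry point that if-elses on nullopt, as suggested above.
    WorkSplit assign_work(uint32_t nblocks, uint32_t grid_cores, const std::optional<uint32_t>& subgrid_cores) {
        if (subgrid_cores.has_value()) {
            // Subgrid path: the caller already trimmed ncores to divide nblocks,
            // so every core gets an equal share and there is no cliff.
            uint32_t ncores = subgrid_cores.value();
            return {ncores, nblocks / ncores, 0};
        }
        // Full-grid path: a trailing "cliff" core may get the remainder
        // (the real helper computes a ceil-based split).
        uint32_t ncores = grid_cores;
        return {ncores, nblocks / ncores, nblocks % ncores};
    }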

    uint32_t nblocks_per_core = nblocks / ncores;

    bool row_major = true;
    bool src_block_sharded = false;
    uint32_t num_rows_block = 0, block_row_size = 0, output_row_size = 0, last_block_row_size_unpadded = 0,
             num_output_rows_unpadded = 0;
    CoreCoord end_core;
    std::vector<CoreCoord> cores_with_rtargs;

    uint32_t num_input_tiles = ntiles_per_block * 2;
    auto [src0_cb_index, cb_src0] = create_cb(
        tt::CBIndex::c_0, program, all_cores, input_single_tile_size, num_input_tiles, input_cb_data_format, nullptr);
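    // The factor of 2 on ntiles_per_block for the input and output CBs presumably
    // double-buffers them so the reader, compute, and writer stages can overlap.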

    uint32_t num_output_tiles = ntiles_per_block * 2;
    auto [output_cb_index, cb_output] = create_cb(
        tt::CBIndex::c_16,
        program,
        all_cores,
        output_single_tile_size,
        num_output_tiles,
        output_cb_data_format,
        nullptr);

    Buffer* src0_buffer = a.buffer();
    Buffer* dst_buffer = output.buffer();
    bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
    std::vector<uint32_t> reader_ct_args = {(uint32_t)src0_is_dram};

    auto unary_reader_kernel_id = CreateKernel(
        program,
        "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp",
        all_cores,
        ReaderDataMovementConfig(reader_ct_args));

    bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
    bool stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size);
    uint32_t log2_stick_size = stick_size_is_power_of_two ? (std::uint32_t)std::log2(stick_size) : 0;
    std::vector<uint32_t> writer_ct_args = {
        (uint32_t)out_is_dram,
        (uint32_t)stick_size_is_power_of_two,
        (uint32_t)log2_stick_size,
    };

    auto unary_writer_kernel_id = CreateKernel(
        program,
        "ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/dataflow/"
        "writer_unary_stick_layout_split_rows_interleaved_parallel_columns.cpp",
        all_cores,
        WriterDataMovementConfig(writer_ct_args));

    // compute kernel
    std::vector<uint32_t> compute_args = {
        (uint32_t)nblocks_per_core,  // per_core_block_cnt
        (uint32_t)ntiles_per_block,  // per_block_ntiles
    };

    std::string compute_kernel(
        "ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp");
    if (ntiles_per_block > MAX_PACK_UNTILIZE_WIDTH || !use_pack_untilize) {
        log_debug(tt::LogOp, "Using slow untilize.");
        compute_kernel =
            std::string("ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/untilize.cpp");
    } else {
        log_debug(tt::LogOp, "Using fast pack untilize.");
    }

    auto untilize_kernel_id = CreateKernel(
        program,
        compute_kernel,
        all_cores,
        ComputeConfig{.fp32_dest_acc_en = fp32_dest_acc_en, .compile_args = compute_args});

    uint32_t tile_start_id = 0;
    uint32_t offset_within_stick = 0;

    auto nsticks_per_core = ntiles_per_column * TILE_HEIGHT;
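    // Every core writes all nsticks rows (the start stick id is 0 on each core);
    // the parallelism is across the stick width, with offset_within_stick
    // selecting the slice of each row this core produces.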

    for (uint32_t i = 0; i < cores.size(); i++) {
        CoreCoord core = cores[i];

        // reader runtime args
        auto ntiles_per_core = ntiles_per_block * nblocks_per_core;
        const std::array reader_rt_args = {
            src0_buffer->address(),  // src_addr
            ntiles_per_core,         // ntiles
            tile_start_id            // start_id
        };

        const std::array writer_rt_args = {
            dst_buffer->address(),               // dst_addr
            nsticks_per_core,                    // nsticks
            stick_size,                          // block_size_nbytes
            ntiles_per_core,                     // ntiles_per_core
            TILE_WIDTH * output.element_size(),  // tile_width_size
            std::uint32_t{0},                    // start stick id = 0, since parallelizing on height
            offset_within_stick};

        tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args);
        tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args);
        cores_with_rtargs.push_back(core);
        tile_start_id += ntiles_per_core;
        offset_within_stick += ntiles_per_core * TILE_WIDTH * output.element_size();
    }
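
    // On re-dispatch only the tensor buffers change, so the override callback
    // below just patches the source/destination addresses (runtime arg 0) on
    // every core that received runtime args.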

    auto override_runtime_arguments_callback = [reader_kernel_id = unary_reader_kernel_id,
                                                writer_kernel_id = unary_writer_kernel_id,
                                                cb_src0 = cb_src0,
                                                cb_output = cb_output,
                                                cores_with_rtargs](
                                                   const void* operation,
                                                   Program& program,
                                                   const std::vector<Tensor>& input_tensors,
                                                   const std::vector<std::optional<const Tensor>>&,
                                                   const std::vector<Tensor>& output_tensors) {
        auto src_buffer = input_tensors.at(0).buffer();
        auto dst_buffer = output_tensors.at(0).buffer();
        {
            auto& runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ clang-diagnostic-error ⚠️
use of undeclared identifier reader_kernel_id

            for (const CoreCoord& core : cores_with_rtargs) {
                auto& runtime_args = runtime_args_by_core[core.x][core.y];
                runtime_args[0] = src_buffer->address();
            }
        }

        {
            auto& runtime_args_by_core = GetRuntimeArgs(program, writer_kernel_id);

⚠️ clang-diagnostic-error ⚠️ use of undeclared identifier writer_kernel_id

            for (const CoreCoord& core : cores_with_rtargs) {
                auto& runtime_args = runtime_args_by_core[core.x][core.y];
                runtime_args[0] = dst_buffer->address();
            }
        }
    };

    return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback};
}

operation::ProgramWithCallbacks untilize_multi_core_parallelize_column(
    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) {
    tt::tt_metal::Program program{};
@@ -41,12 +225,13 @@ operation::ProgramWithCallbacks untilize_multi_core_parallelize_column(
    Device* device = a.device();

    auto grid_size = device->compute_with_storage_grid_size();

    uint32_t ntiles = a.volume() / TILE_HW;
    uint32_t ncores_x = grid_size.x;
-   uint32_t ncores_y = grid_size.y;
    // uint32_t ncores_x = 2;

    ncores_x = get_largest_divisor(ntiles, ncores_x);
+   uint32_t ncores_y = grid_size.y;
    // uint32_t ncores_y = 1;
    ncores_y = get_largest_divisor(ntiles, ncores_y, ncores_x);

@@ -260,7 +445,11 @@ operation::ProgramWithCallbacks untilize_multi_core_parallelize_column(
}

operation::ProgramWithCallbacks untilize_multi_core(
-   const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) {
+   const Tensor& a,
+   Tensor& output,
+   bool use_pack_untilize,
+   bool fp32_dest_acc_en,
+   std::optional<CoreRangeSet> sub_core_grids) {
    tt::tt_metal::Program program{};

    bool src_sharded = a.memory_config().is_sharded();
@@ -289,7 +478,12 @@
    if (!src_sharded and !out_sharded) {
        uint32_t ntiles_height = ntiles / ntiles_per_block;
        if (ntiles_height == 1) {
-           return untilize_multi_core_parallelize_column(a, output, use_pack_untilize, fp32_dest_acc_en);
+           if (sub_core_grids.has_value()) {
+               return untilize_multi_core_parallelize_column_subgrid(
+                   a, output, use_pack_untilize, fp32_dest_acc_en, sub_core_grids.value());
+           } else {
+               return untilize_multi_core_parallelize_column(a, output, use_pack_untilize, fp32_dest_acc_en);
+           }
        } else {
            return untilize_single_core(a, output, use_pack_untilize, fp32_dest_acc_en);
        }

@@ -9,7 +9,11 @@
namespace ttnn::operations::data_movement::detail {

operation::ProgramWithCallbacks untilize_multi_core(
-   const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
+   const Tensor& a,
+   Tensor& output,
+   bool use_pack_untilize,
+   bool fp32_dest_acc_en,
+   std::optional<CoreRangeSet> sub_core_grids);

operation::ProgramWithCallbacks untilize_single_core(
    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);

11 changes: 7 additions & 4 deletions in ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp

@@ -40,7 +40,8 @@ ttnn::Tensor ExecuteUntilize::invoke(
    const ttnn::Tensor& input_tensor,
    const std::optional<MemoryConfig>& memory_config,
    bool use_multicore,
-   bool use_pack_untilize) {
+   bool use_pack_untilize,
+   const std::optional<CoreRangeSet> sub_core_grids) {

⚠️ performance-unnecessary-value-param ⚠️ the const qualified parameter sub_core_grids is copied for each invocation; consider making it a reference

Suggested change:
-   const std::optional<CoreRangeSet> sub_core_grids) {
+   const std::optional<CoreRangeSet>& sub_core_grids) {

    bool fp32_dest_acc_en =
        input_tensor.get_dtype() ==
        DataType::UINT32;  // MT: Currently only uint32 is moved to DST directly, fp32 is converted to fp16b
@@ -51,7 +52,8 @@
            memory_config.value_or(input_tensor.memory_config()),
            use_multicore,
            use_pack_untilize,
-           fp32_dest_acc_en},
+           fp32_dest_acc_en,
+           sub_core_grids},
        {input_tensor},
        {},
        {},
@@ -65,8 +67,9 @@
    const ttnn::Tensor& input_tensor,
    const std::optional<MemoryConfig>& memory_config,
    bool use_multicore,
-   bool use_pack_untilize) {
-   return invoke(DefaultQueueId, input_tensor, memory_config, use_multicore, use_pack_untilize);
+   bool use_pack_untilize,
+   const std::optional<CoreRangeSet> sub_core_grids) {

⚠️ performance-unnecessary-value-param ⚠️ the const qualified parameter sub_core_grids is copied for each invocation; consider making it a reference

Suggested change:
-   const std::optional<CoreRangeSet> sub_core_grids) {
+   const std::optional<CoreRangeSet>& sub_core_grids) {

+   return invoke(DefaultQueueId, input_tensor, memory_config, use_multicore, use_pack_untilize, sub_core_grids);
}

} // namespace ttnn::operations::data_movement

@@ -16,13 +16,15 @@ struct ExecuteUntilize {
        const ttnn::Tensor& input_tensor,
        const std::optional<MemoryConfig>& memory_config = std::nullopt,
        bool use_multicore = true,
-       bool use_pack_untilize = true);
+       bool use_pack_untilize = true,
+       const std::optional<CoreRangeSet> sub_core_grids = std::nullopt);

⚠️ performance-unnecessary-value-param ⚠️ the const qualified parameter sub_core_grids is copied for each invocation; consider making it a reference

Suggested change:
-       const std::optional<CoreRangeSet> sub_core_grids = std::nullopt);
+       const std::optional<CoreRangeSet>& sub_core_grids = std::nullopt);


    static ttnn::Tensor invoke(
        const ttnn::Tensor& input_tensor,
        const std::optional<MemoryConfig>& memory_config = std::nullopt,
        bool use_multicore = true,
-       bool use_pack_untilize = true);
+       bool use_pack_untilize = true,
+       const std::optional<CoreRangeSet> sub_core_grids = std::nullopt);

⚠️ performance-unnecessary-value-param ⚠️ the const qualified parameter sub_core_grids is copied for each invocation; consider making it a reference

Suggested change:
-       const std::optional<CoreRangeSet> sub_core_grids = std::nullopt);
+       const std::optional<CoreRangeSet>& sub_core_grids = std::nullopt);

};

} // namespace operations::data_movement