diff --git a/programming_examples/matrix_scalar_add/common.py b/programming_examples/matrix_scalar_add/common.py index 6b8c39e67..1c5cfbf59 100644 --- a/programming_examples/matrix_scalar_add/common.py +++ b/programming_examples/matrix_scalar_add/common.py @@ -5,13 +5,13 @@ import air.backend.xrt as xrt_backend import filelock -IMAGE_WIDTH = 32 -IMAGE_HEIGHT = 16 -IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT] +IMAGE_WIDTH = 16 +IMAGE_HEIGHT = 32 +IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH] -TILE_WIDTH = 16 -TILE_HEIGHT = 8 -TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT] +TILE_WIDTH = 8 +TILE_HEIGHT = 16 +TILE_SIZE = [TILE_HEIGHT, TILE_WIDTH] assert IMAGE_WIDTH % TILE_WIDTH == 0 assert IMAGE_HEIGHT % TILE_HEIGHT == 0 @@ -67,7 +67,7 @@ def test_main(build_module, experimental_passes, verbose=False): row = i // IMAGE_WIDTH col = i % IMAGE_WIDTH - tile_num = (row // TILE_HEIGHT) * (IMAGE_HEIGHT // TILE_HEIGHT) + ( + tile_num = (row // TILE_HEIGHT) * (IMAGE_WIDTH // TILE_WIDTH) + ( col // TILE_WIDTH ) diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py index 569946c9c..953d10fab 100644 --- a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py +++ b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py @@ -51,8 +51,8 @@ def launch_body(a, b): # Transfer one tile of data per worker for h in range(IMAGE_HEIGHT // TILE_HEIGHT): for w in range(IMAGE_WIDTH // TILE_WIDTH): - offset0 = IMAGE_HEIGHT * h - offset1 = IMAGE_HEIGHT * w + offset0 = TILE_HEIGHT * h + offset1 = TILE_WIDTH * w # Put data into the channel tile by tile ChannelPut( @@ -66,8 +66,8 @@ def launch_body(a, b): # Transfer one tile of data per worker for h in range(IMAGE_HEIGHT // TILE_HEIGHT): for w in range(IMAGE_WIDTH // TILE_WIDTH): - offset0 = IMAGE_HEIGHT * h - offset1 = IMAGE_HEIGHT * w + offset0 = TILE_HEIGHT * h + offset1 = TILE_WIDTH * w # Write data back out to the channel tile by tile ChannelGet( @@ -109,7 +109,7 @@ def herd_body(_tx, _ty, _sx, _sy): for j in range_(TILE_HEIGHT): for i in range_(TILE_WIDTH): # Load the input value from tile_in - val_in = load(tile_in, [i, j]) + val_in = load(tile_in, [j, i]) # Compute the output value val_out = arith.addi( @@ -121,7 +121,7 @@ def herd_body(_tx, _ty, _sx, _sy): ) # Store the output value in tile_out - store(val_out, tile_out, [i, j]) + store(val_out, tile_out, [j, i]) yield_([]) yield_([]) diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py index 71229eaaa..4905d3eee 100644 --- a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py +++ b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py @@ -46,10 +46,20 @@ def segment_body(arg2, arg3): # We are hoping to map each tile to a different compute core. @herd( name="xaddherd", - sizes=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT], + sizes=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH], operands=[arg2, arg3], ) def herd_body(tx, ty, sx, sy, a, b): + scaled_index_map_height = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), + AffineConstantExpr.get(TILE_HEIGHT), + ) + ], + ) scaled_index_map_width = AffineMap.get( 0, 1, @@ -66,7 +76,7 @@ def herd_body(tx, ty, sx, sy, a, b): [ AffineExpr.get_mul( AffineSymbolExpr.get(0), - AffineConstantExpr.get(IMAGE_HEIGHT // TILE_HEIGHT), + AffineConstantExpr.get(IMAGE_WIDTH // TILE_WIDTH), ) ], ) @@ -80,7 +90,7 @@ def herd_body(tx, ty, sx, sy, a, b): ) ], ) - offset0 = affine_apply(scaled_index_map_width, [tx]) + offset0 = affine_apply(scaled_index_map_height, [tx]) offset1 = affine_apply(scaled_index_map_width, [ty]) tile_index_height = affine_apply(create_tile_index_height, [tx]) compute_tile_id = affine_apply( @@ -114,7 +124,7 @@ def herd_body(tx, ty, sx, sy, a, b): for j in range_(TILE_HEIGHT): for i in range_(TILE_WIDTH): # Load the input value from tile_in - val_in = load(tile_in, [i, j]) + val_in = load(tile_in, [j, i]) # Compute the output value val_out = arith.addi( @@ -122,7 +132,7 @@ def herd_body(tx, ty, sx, sy, a, b): ) # Store the output value in tile_out - store(val_out, tile_out, [i, j]) + store(val_out, tile_out, [j, i]) yield_([]) yield_([]) diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py index 956b7ac1b..471ad532a 100644 --- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py +++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py @@ -45,18 +45,28 @@ def copy(arg0, arg1): operands=[arg0, arg1], ) def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b): - scaled_index_map = AffineMap.get( + scaled_index_map_height = AffineMap.get( 0, 1, [ AffineExpr.get_mul( AffineSymbolExpr.get(0), - AffineConstantExpr.get(IMAGE_HEIGHT), + AffineConstantExpr.get(TILE_HEIGHT), ) ], ) - offset0 = affine_apply(scaled_index_map, [tile_index0]) - offset1 = affine_apply(scaled_index_map, [tile_index1]) + scaled_index_map_width = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), + AffineConstantExpr.get(TILE_WIDTH), + ) + ], + ) + offset0 = affine_apply(scaled_index_map_height, [tile_index0]) + offset1 = affine_apply(scaled_index_map_width, [tile_index1]) # Put data into the channel tile by tile ChannelPut( @@ -111,7 +121,7 @@ def herd_body(tx, ty, sx, sy, a, b): for j in range_(TILE_HEIGHT): for i in range_(TILE_WIDTH): # Load the input value from tile_in - val_in = load(tile_in, [i, j]) + val_in = load(tile_in, [j, i]) # Compute the output value TODO(hunhoffe): this is not correct, not sure how to percolate launch info here val_out = arith.addi( @@ -122,7 +132,7 @@ def herd_body(tx, ty, sx, sy, a, b): store( val_out, tile_out, - [i, j], + [j, i], ) yield_([]) yield_([]) diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit b/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit index 12342da6c..fe881ef0f 100644 --- a/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit +++ b/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit @@ -6,4 +6,3 @@ // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile run | FileCheck %s // CHECK: PASS! - // XFAIL: * \ No newline at end of file diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py index 037e615ec..d2e3ff8be 100644 --- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py +++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py @@ -46,8 +46,8 @@ def launch_body(a, b): for tile_index0 in range_(IMAGE_HEIGHT // TILE_HEIGHT): for tile_index1 in range_(IMAGE_WIDTH // TILE_WIDTH): # Convert the type of the tile size variable to the Index type - tile_size0 = arith.ConstantOp.create_index(IMAGE_HEIGHT) - tile_size1 = arith.ConstantOp.create_index(IMAGE_HEIGHT) + tile_size0 = arith.ConstantOp.create_index(TILE_HEIGHT) + tile_size1 = arith.ConstantOp.create_index(TILE_WIDTH) # Calculate the offset into the channel data, which is based on which tile index # we are at using tile_index0 and tile_index1 (our loop vars). @@ -111,7 +111,7 @@ def herd_body(_tx, _ty, _sx, _sy): for j in range_(TILE_HEIGHT): for i in range_(TILE_WIDTH): # Load the input value from tile_in - val_in = load(tile_in, [i, j]) + val_in = load(tile_in, [j, i]) # Compute the output value val_out = arith.addi( @@ -122,7 +122,7 @@ def herd_body(_tx, _ty, _sx, _sy): store( val_out, tile_out, - [i, j], + [j, i], ) yield_([]) yield_([]) diff --git a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py index af9068bfe..2b1c03575 100644 --- a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py +++ b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py @@ -65,8 +65,8 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): tile_out = AllocOp(tile_type, [], []) # Convert the type of the tile size variable to the Index type - tile_size0 = arith.ConstantOp.create_index(IMAGE_HEIGHT) - tile_size1 = arith.ConstantOp.create_index(IMAGE_HEIGHT) + tile_size0 = arith.ConstantOp.create_index(TILE_HEIGHT) + tile_size1 = arith.ConstantOp.create_index(TILE_WIDTH) # Calculate the offset into the channel data, which is based on our loop vars offset0 = arith.MulIOp(tile_size0, tile_index0) @@ -74,7 +74,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): tile_num = arith.MulIOp( tile_index0, arith.ConstantOp.create_index( - IMAGE_HEIGHT // TILE_HEIGHT + IMAGE_WIDTH // TILE_WIDTH ), ) tile_num = arith.AddIOp(tile_num, tile_index1) @@ -92,7 +92,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): for j in range_(TILE_HEIGHT): for i in range_(TILE_WIDTH): # Load the input value from tile_in - val_in = load(tile_in, [i, j]) + val_in = load(tile_in, [j, i]) # Compute the output value val_out = arith.addi( @@ -100,7 +100,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): ) # Store the output value in tile_out - store(val_out, tile_out, [i, j]) + store(val_out, tile_out, [j, i]) yield_([]) yield_([])