Skip to content

Commit

Permalink
Various runtime::ttmetal CQExecutor buffer map improvements / segfault workaround / TTMetal Tests (#529)
Browse files Browse the repository at this point in the history

* A pair of runtime::ttmetal CQExecutor buffer map improvements (#408)
 - Prevent duplicate Buffers from being created inside
   CreateBufferCommand handler by checking for existence in buffers umap.
 - Change to use buffers.erase() in DeallocateBufferCommand to
   actually remove the entry from buffers umap. Buffer will
   still be destroyed because it goes out of scope.
 - Neither of these help with the original segfault in this ticket
   but these were found through visual observation.

* Temporary Workaround for tt-metal Segfaults during teardown (#408)
 - A hack, in createBufferFromTensorRef(), remove when proper
   bug fix is made in tt-metal and propagates here.

* Remove UNSUPPORTED: true flag from tests now that CI mixing tests issue resolved
 - Update test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir with CHECK to avoid
   errors and add missing flag --ttmetal-serialize-to-binary to run ttm binary in CI
  • Loading branch information
kmabeeTT authored Aug 31, 2024
1 parent ab9a8b2 commit fd466fc
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 7 deletions.
13 changes: 13 additions & 0 deletions runtime/include/tt/runtime/detail/ttmetal.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,19 @@ createBufferFromTensorRef(::tt::tt_metal::Device *device,
std::shared_ptr<::tt::tt_metal::Buffer> buffer =
::tt::tt_metal::CreateBuffer(shardedBufferConfig);
assert(tensorRef->address());

// Issue #408: Temporary Hack, remove when fix available.
// Update tt-metal BUFFER_MAP with updated address and remove
// entry for original alloc'd address.
auto &buffer_map = tt::tt_metal::detail::BUFFER_MAP;
auto map_copy = buffer_map.value();
auto old_key = std::make_tuple(device->id(), buffer->address());
if (auto it = map_copy.find(old_key); it != map_copy.end()) {
auto new_key = std::make_tuple(device->id(), tensorRef->address());
buffer_map.insert(new_key, it->second);
buffer_map.erase(old_key);
}

buffer->set_address(tensorRef->address());
return buffer;
}
Expand Down
8 changes: 5 additions & 3 deletions runtime/lib/ttmetal/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,10 @@ void CQExecutor::execute(

/// Handle a CreateBufferCommand: lazily create the device buffer backing the
/// tensor ref, keyed by its global id.
///
/// Issue #408: the same global id may be seen more than once, so creation must
/// be idempotent — a Buffer is only created when no entry exists yet.
/// try_emplace does a single hash lookup: it inserts a default (null) mapped
/// value only when the key is absent, and createBufferFromTensorRef is called
/// only in that case, so no duplicate Buffer is ever constructed.
void CQExecutor::execute(
    ::tt::target::metal::CreateBufferCommand const *command) {
  auto [it, inserted] = buffers.try_emplace(command->ref()->global_id());
  if (inserted) {
    it->second = createBufferFromTensorRef(device, command->ref());
  }
}

void CQExecutor::execute(
Expand All @@ -252,7 +254,7 @@ void CQExecutor::execute(
assert(iter != buffers.end() && "Buffer not allocated");
assert(iter->second != nullptr && "Buffer already deallocated");
::tt::tt_metal::DeallocateBuffer(*iter->second);
iter->second.reset();
buffers.erase(iter);
}

void CQExecutor::execute(
Expand Down
1 change: 0 additions & 1 deletion test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-to-ttmetal-backend-pipeline --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s
// UNSUPPORTED: true
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>

func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
Expand Down
25 changes: 23 additions & 2 deletions test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
// UNSUPPORTED: true
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s
#l1_ = #tt.memory_space<l1>

#untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>>
#tilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32 x 32, f32>, #l1_>>
#tilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>>
#untilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>>
// Round-trips a 64x128 f32 tensor: untilized 1x1 grid -> tilized 1x1 grid ->
// tilized 2x2 grid -> untilized 2x2 grid. Each to_layout lowers to an
// alloc + dispatch pair in TTMetal; the CHECK lines pin that lowering.
func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized2x2> {
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%0 = tensor.empty() : tensor<64x128xf32, #tilized>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
// Tilize: untilized 1x1 -> tilized 1x1.
%1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #untilized>, tensor<64x128xf32, #tilized>) -> tensor<64x128xf32, #tilized>
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%2 = tensor.empty() : tensor<64x128xf32, #tilized2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
// Reblock: tilized 1x1 grid -> tilized 2x2 grid.
%3 = "ttir.to_layout"(%1, %2) : (tensor<64x128xf32, #tilized>, tensor<64x128xf32, #tilized2x2>) -> tensor<64x128xf32, #tilized2x2>
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%4 = tensor.empty() : tensor<64x128xf32, #untilized2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
// Untilize back to row-major on the 2x2 grid.
%5 = "ttir.to_layout"(%3, %4) : (tensor<64x128xf32, #tilized2x2>, tensor<64x128xf32, #untilized2x2>) -> tensor<64x128xf32, #untilized2x2>
return %5 : tensor<64x128xf32, #untilized2x2>
}
Expand All @@ -22,13 +27,19 @@ func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64
#tilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>>
#untilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>>
func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> {
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%0 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%1 = "ttir.to_layout"(%arg0, %0) : (tensor<2x3x64x128xf32, #untilized4D>, tensor<2x3x64x128xf32, #tilized4D>) -> tensor<2x3x64x128xf32, #tilized4D>

// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%2 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D_2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%3 = "ttir.to_layout"(%1, %2) : (tensor<2x3x64x128xf32, #tilized4D>, tensor<2x3x64x128xf32, #tilized4D_2x2>) -> tensor<2x3x64x128xf32, #tilized4D_2x2>

// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%4 = tensor.empty() : tensor<2x3x64x128xf32, #untilized4D_2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%5 = "ttir.to_layout"(%3, %4) : (tensor<2x3x64x128xf32, #tilized4D_2x2>, tensor<2x3x64x128xf32, #untilized4D_2x2>) -> tensor<2x3x64x128xf32, #untilized4D_2x2>

return %5 : tensor<2x3x64x128xf32, #untilized4D_2x2>
Expand All @@ -40,23 +51,33 @@ func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> ten
#tilized_big_3x6 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>>
func.func @tilize_reblock_big(%arg0: tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> {
// move to tilized 1x1
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%0 = tensor.empty() : tensor<96x192xf32, #tilized_big>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%1 = "ttir.to_layout"(%arg0, %0) : (tensor<96x192xf32, #untilized_big>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big>

// move to tilized 2x3
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%2 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%3 = "ttir.to_layout"(%1, %2) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #tilized_big_3x2>) -> tensor<96x192xf32, #tilized_big_3x2>

// move to tilized 3x3
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%4 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x6>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%5 = "ttir.to_layout"(%3, %4) : (tensor<96x192xf32, #tilized_big_3x2>, tensor<96x192xf32, #tilized_big_3x6>) -> tensor<96x192xf32, #tilized_big_3x6>

// move back to tilized 1x1
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%6 = tensor.empty() : tensor<96x192xf32, #tilized_big>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%7 = "ttir.to_layout"(%5, %6) : (tensor<96x192xf32, #tilized_big_3x6>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big>

// untilize
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%8 = tensor.empty() : tensor<96x192xf32, #untilized_big>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%9 = "ttir.to_layout"(%7, %8) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big>

return %9 : tensor<96x192xf32, #untilized_big>
Expand Down
1 change: 0 additions & 1 deletion test/ttmlir/Silicon/TTMetal/to_layout.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s
// UNSUPPORTED: true
#l1_ = #tt.memory_space<l1>

#layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>>
Expand Down

0 comments on commit fd466fc

Please sign in to comment.