Skip to content

Commit

Permalink
Various runtime::ttmetal CQExecutor buffer map improvements / segfault workaround / TTMetal Tests (#529)
Browse files Browse the repository at this point in the history

* A pair of runtime::ttmetal CQExecutor buffer map improvements (#408)
 - Prevent duplicate Buffers from being created inside
   CreateBufferCommand handler by checking for existence in buffers umap.
 - Change to use buffers.erase() in DeallocateBufferCommand to
   actually remove the entry from buffers umap. Buffer will
   still be destroyed because it goes out of scope.
 - Neither of these help with the original segfault in this ticket
   but these were found through visual observation.

* Temporary Workaround for tt-metal Segfaults during teardown (#408)
 - A hack, in createBufferFromTensorRef(), remove when proper
   bug fix is made in tt-metal and propagates here.

* Remove UNSUPPORTED: true flag from tests now that CI mixing tests issue resolved
 - Update test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir with CHECK to avoid
   errors and add missing flag --ttmetal-serialize-to-binary to run ttm binary in CI
  • Loading branch information
kmabeeTT authored Aug 31, 2024
1 parent ab9a8b2 commit fd466fc
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 7 deletions.
13 changes: 13 additions & 0 deletions runtime/include/tt/runtime/detail/ttmetal.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,19 @@ createBufferFromTensorRef(::tt::tt_metal::Device *device,
std::shared_ptr<::tt::tt_metal::Buffer> buffer =
::tt::tt_metal::CreateBuffer(shardedBufferConfig);
assert(tensorRef->address());

// Issue #408: Temporary Hack, remove when fix available.
// Update tt-metal BUFFER_MAP with updated address and remove
// entry for original alloc'd address.
auto &buffer_map = tt::tt_metal::detail::BUFFER_MAP;
auto map_copy = buffer_map.value();
auto old_key = std::make_tuple(device->id(), buffer->address());
if (auto it = map_copy.find(old_key); it != map_copy.end()) {
auto new_key = std::make_tuple(device->id(), tensorRef->address());
buffer_map.insert(new_key, it->second);
buffer_map.erase(old_key);
}

buffer->set_address(tensorRef->address());
return buffer;
}
Expand Down
8 changes: 5 additions & 3 deletions runtime/lib/ttmetal/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,10 @@ void CQExecutor::execute(

/// Handle a CreateBufferCommand: lazily create the device buffer backing the
/// tensor ref, keyed by its global id.
///
/// Issue #408: the same global id may be seen more than once, so creation must
/// be idempotent — a Buffer is only created when no entry exists yet.
/// try_emplace does a single hash lookup: it inserts a default (null) mapped
/// value only when the key is absent, and createBufferFromTensorRef is called
/// only in that case, so no duplicate Buffer is ever constructed.
void CQExecutor::execute(
    ::tt::target::metal::CreateBufferCommand const *command) {
  auto [it, inserted] = buffers.try_emplace(command->ref()->global_id());
  if (inserted) {
    it->second = createBufferFromTensorRef(device, command->ref());
  }
}

void CQExecutor::execute(
Expand All @@ -252,7 +254,7 @@ void CQExecutor::execute(
assert(iter != buffers.end() && "Buffer not allocated");
assert(iter->second != nullptr && "Buffer already deallocated");
::tt::tt_metal::DeallocateBuffer(*iter->second);
iter->second.reset();
buffers.erase(iter);
}

void CQExecutor::execute(
Expand Down
1 change: 0 additions & 1 deletion test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-to-ttmetal-backend-pipeline --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s
// UNSUPPORTED: true
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>

func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
Expand Down
25 changes: 23 additions & 2 deletions test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
// UNSUPPORTED: true
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s
#l1_ = #tt.memory_space<l1>

#untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>>
#tilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32 x 32, f32>, #l1_>>
#tilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>>
#untilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>>
// Round-trips a 64x128 f32 tensor: untilized 1x1 grid -> tilized 1x1 grid ->
// tilized 2x2 grid -> untilized 2x2 grid. Each to_layout lowers to an
// alloc + dispatch pair in TTMetal; the CHECK lines pin that lowering.
func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized2x2> {
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%0 = tensor.empty() : tensor<64x128xf32, #tilized>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
// Tilize: untilized 1x1 -> tilized 1x1.
%1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #untilized>, tensor<64x128xf32, #tilized>) -> tensor<64x128xf32, #tilized>
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%2 = tensor.empty() : tensor<64x128xf32, #tilized2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
// Reblock: tilized 1x1 grid -> tilized 2x2 grid.
%3 = "ttir.to_layout"(%1, %2) : (tensor<64x128xf32, #tilized>, tensor<64x128xf32, #tilized2x2>) -> tensor<64x128xf32, #tilized2x2>
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%4 = tensor.empty() : tensor<64x128xf32, #untilized2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
// Untilize back to row-major on the 2x2 grid.
%5 = "ttir.to_layout"(%3, %4) : (tensor<64x128xf32, #tilized2x2>, tensor<64x128xf32, #untilized2x2>) -> tensor<64x128xf32, #untilized2x2>
return %5 : tensor<64x128xf32, #untilized2x2>
}
Expand All @@ -22,13 +27,19 @@ func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64
#tilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>>
#untilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>>
func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> {
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%0 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%1 = "ttir.to_layout"(%arg0, %0) : (tensor<2x3x64x128xf32, #untilized4D>, tensor<2x3x64x128xf32, #tilized4D>) -> tensor<2x3x64x128xf32, #tilized4D>

// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%2 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D_2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%3 = "ttir.to_layout"(%1, %2) : (tensor<2x3x64x128xf32, #tilized4D>, tensor<2x3x64x128xf32, #tilized4D_2x2>) -> tensor<2x3x64x128xf32, #tilized4D_2x2>

// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%4 = tensor.empty() : tensor<2x3x64x128xf32, #untilized4D_2x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%5 = "ttir.to_layout"(%3, %4) : (tensor<2x3x64x128xf32, #tilized4D_2x2>, tensor<2x3x64x128xf32, #untilized4D_2x2>) -> tensor<2x3x64x128xf32, #untilized4D_2x2>

return %5 : tensor<2x3x64x128xf32, #untilized4D_2x2>
Expand All @@ -40,23 +51,33 @@ func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> ten
#tilized_big_3x6 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>>
func.func @tilize_reblock_big(%arg0: tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> {
// move to tilized 1x1
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%0 = tensor.empty() : tensor<96x192xf32, #tilized_big>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%1 = "ttir.to_layout"(%arg0, %0) : (tensor<96x192xf32, #untilized_big>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big>

// move to tilized 2x3
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%2 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x2>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%3 = "ttir.to_layout"(%1, %2) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #tilized_big_3x2>) -> tensor<96x192xf32, #tilized_big_3x2>

// move to tilized 3x3
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%4 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x6>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%5 = "ttir.to_layout"(%3, %4) : (tensor<96x192xf32, #tilized_big_3x2>, tensor<96x192xf32, #tilized_big_3x6>) -> tensor<96x192xf32, #tilized_big_3x6>

// move back to tilized 1x1
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%6 = tensor.empty() : tensor<96x192xf32, #tilized_big>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%7 = "ttir.to_layout"(%5, %6) : (tensor<96x192xf32, #tilized_big_3x6>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big>

// untilize
// CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]]
%8 = tensor.empty() : tensor<96x192xf32, #untilized_big>
// CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]]
%9 = "ttir.to_layout"(%7, %8) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big>

return %9 : tensor<96x192xf32, #untilized_big>
Expand Down
1 change: 0 additions & 1 deletion test/ttmlir/Silicon/TTMetal/to_layout.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s
// UNSUPPORTED: true
#l1_ = #tt.memory_space<l1>

#layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>>
Expand Down

0 comments on commit fd466fc

Please sign in to comment.