Enabling op model interface for constraints and L1 usage. (#1554)

This PR plumbs OpModelInterface to the underlying tt-metal op queries for validation and L1 memory consumption. `TTNNOpModelInterface.td` `getOpConstraints` takes the input(s) and output `TTNNLayoutAttr` and returns a tuple of three values: 1. A boolean indicating whether the op is legal for the given input/output layouts. 2. If the op is legal, a tuple of three values representing the op's L1 memory usage estimate in bytes. - The first value is the CB L1 peak allocation in bytes. - The second value is the Tensor L1 peak allocation in bytes. - The third value is the Output L1 buffer allocation in bytes. 3. If the op is illegal, a string describing the failure. `TTNNOpModelInterface.cpp` implements hooks to the _wrapper library_ `TTNNOpModelLib` (where the metal API is). For each op, the implementation takes - tensor shapes (`llvm::ArrayRef<>`) from its operands, - the worker grid (used for virtual-to-physical core conversion), - op-specific params (like the softmax dimension), and - layouts (`TTNNLayoutAttr`), and passes them to the _wrapper library_ `TTNNOpModelLib`. `TTNNOpModelLib` converts MLIR structures to metal structures and calls into the underlying `tt-metal` op interface. The underlying `tt-metal` op interface `::ttnn::graph::query_op_constraints(..)` consumes a target op (e.g. `ttnn::relu`) and its arguments, in the order of the `::invoke` function implemented by the op we are targeting. Implemented `SingletonDeviceContext` to avoid constantly opening/closing the device. This class should ensure the opened device is a mockup device once that is implemented on the tt-metal side (tenstorrent/tt-metal#14000). Added 3 types of unit tests: - TestConversion - tests conversion of MLIR to TTNN types - TestOpModelLib - tests the interface to the metal API - TestOpModelInterface - tests the interface built in metal ops. Due to differences in the tt-metal and LLVM project setups (compiler standard, exceptions), these are implemented as plain Google unit tests.
This differs from the other unit tests, which are also Google unit tests but are wrapped into LLVM (and invoked using llvm-lit). As these tests require TT hardware (until the mockup device is implemented), the `Build tt-mlir` op_model flavour was changed to use n300 runners. Additionally, the op model interface was wired into the ShardSolver; mnist_sharded.mlir compiles and runs. @odjuricicTT confirmed that the solution found is the one we expected. An internal doc describing more details can be found [here](https://tenstorrent-my.sharepoint.com/:w:/p/mbezulj/ETC6JOzVU9dAhQjgIAiwGt8BdjbNdXmMw-fZTo7As1BVXw?e=kM7a3c).
tenstorrent · Dec 31, 2024 · 3745a88 · 3745a88
1 parent 7ec7430
commit 3745a88
Show file tree

Hide file tree

Showing 34 changed files with 2,924 additions and 360 deletions.
diff --git a/.github/actions/build-tt-mlir-action/action.yaml b/.github/actions/build-tt-mlir-action/action.yaml
@@ -0,0 +1,124 @@
+name: "Build tt-mlir"
+description: "Composite action for building, testing, and uploading artifacts for tt-mlir."
+inputs:
+  enable-perf:  # forwarded verbatim to -DTT_RUNTIME_ENABLE_PERF_TRACE
+    description: "Enable performance tracing"
+    required: true
+  enable-op-model:  # forwarded to -DTTMLIR_ENABLE_OPMODEL; op-model-only steps run when it equals 'ON'
+    description: "Enable op model interface tests"
+    required: true
+  build-name:  # suffix of the ttrt-whl-* and install-artifacts-* artifact names
+    description: "A unique name for this build (e.g., 'run' or 'perf')"
+    required: true
+  build-output-dir:  # passed to `cmake -B` and `cmake --build`
+    description: "Build folder location"
+    required: true
+  install-output-dir:  # used as CMAKE_INSTALL_PREFIX and archived into artifact.tar
+    description: "Install folder location"
+    required: true
+  work-dir:  # passed to `cmake -S`; ttrt-artifacts/system_desc.ttsys is looked up under it
+    description: "tt-mlir root"
+    required: true
+  test_report_path:  # destination the test report.xml is copied to and uploaded from
+    description: "Path to test report"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+
+    - name: Configure CMake  # Release configure with Ninja + clang-17, ccache as the C++ compiler launcher
+      shell: bash
+      run: |
+        source env/activate  # presumably activates the project toolchain environment; path is relative to the step's cwd
+        cmake -G Ninja \
+          -B ${{ inputs.build-output-dir }} \
+          -DCMAKE_CXX_COMPILER=clang++-17 \
+          -DCMAKE_C_COMPILER=clang-17 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_INSTALL_PREFIX=${{ inputs.install-output-dir }} \
+          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+          -DTTMLIR_ENABLE_RUNTIME=ON \
+          -DTTMLIR_ENABLE_RUNTIME_TESTS=ON \
+          -DTT_RUNTIME_ENABLE_PERF_TRACE=${{ inputs.enable-perf }} \
+          -DTTMLIR_ENABLE_STABLEHLO=ON \
+          -DTTMLIR_ENABLE_OPMODEL=${{ inputs.enable-op-model }} \
+          -S ${{ inputs.work-dir }}
+
+    - name: Build and Install  # builds the default target, then installs only the "Test" component
+      shell: bash
+      run: |
+        source env/activate
+        cmake --build ${{ inputs.build-output-dir }}
+        cmake --install ${{ inputs.build-output-dir }} --component Test
+
+    - name: Build ttrt  # builds the "ttrt" target explicitly (passed through to the native build tool)
+      shell: bash
+      run: |
+        source env/activate
+        cmake --build ${{ inputs.build-output-dir }} -- ttrt
+
+    - name: Generate and set system descriptor  # op-model builds only
+      shell: bash
+      if: inputs.enable-op-model == 'ON'
+      run: |
+        source env/activate
+        ttrt query --save-artifacts  # presumably writes ttrt-artifacts/system_desc.ttsys under the cwd, which later steps look for under work-dir; confirm
+
+    - name: Run tt-mlir tests  # runs check-ttmlir, then copies its report.xml to test_report_path
+      shell: bash
+      run: |
+        source env/activate
+        if [ -f "${{ inputs.work-dir }}/ttrt-artifacts/system_desc.ttsys" ]; then
+          export SYSTEM_DESC_PATH="${{ inputs.work-dir }}/ttrt-artifacts/system_desc.ttsys"
+        fi
+        cmake --build ${{ inputs.build-output-dir }} -- check-ttmlir
+        cp "${{ inputs.build-output-dir }}/test/report.xml" "${{ inputs.test_report_path }}"  # use the configured build dir, not a hard-coded build/
+
+    - name: Run OpModelInterface Tests  # runs the three OpModel test binaries directly, not through check-ttmlir
+      shell: bash
+      if: inputs.enable-op-model == 'ON'
+      run: |
+        source env/activate
+        system_desc="${{ inputs.work-dir }}/ttrt-artifacts/system_desc.ttsys"
+        if [ -f "$system_desc" ]; then export SYSTEM_DESC_PATH="$system_desc"; fi
+        op_model_tests="${{ inputs.build-output-dir }}/test/unittests/OpModel/TTNN"
+        "$op_model_tests/Conversion/TestConversion"
+        "$op_model_tests/Lib/TestOpModelLib"
+        "$op_model_tests/Op/TestOpModelInterface"
+
+    - name: Upload Test Report  # artifact name is keyed on the declared, unique build-name input
+      uses: actions/upload-artifact@v4
+      with:
+        name: test-reports-${{ inputs.build-name }}-perf-${{ inputs.enable-perf }}-op_model-${{ inputs.enable-op-model }}
+        path: ${{ inputs.test_report_path }}
+
+
+    - name: Upload ttrt .whl  # wheel is taken from the configured build dir rather than a hard-coded build/
+      uses: actions/upload-artifact@v4
+      with:
+        name: ttrt-whl-${{ inputs.build-name }}
+        path: ${{ inputs.build-output-dir }}/runtime/tools/python/build/ttrt*.whl
+
+    - name: Archive Install Directory  # tar first: upload-artifact does not preserve file permissions
+      shell: bash
+      working-directory: ${{ inputs.install-output-dir }}
+      run: tar cvf artifact.tar .
+
+    - name: Upload Install Folder  # uploads the tarball created by the previous step
+      uses: actions/upload-artifact@v4
+      with:
+        name: install-artifacts-${{ inputs.build-name }}
+        path: ${{ inputs.install-output-dir }}/artifact.tar
+
+    - name: Get Latest Tag and Version  # shell variables only: values written to GITHUB_ENV are not visible to ${{ env.* }} within the same step
+      shell: bash
+      run: |
+        latest_tag=$(git describe --tags --abbrev=0)
+        commit_count=$(git rev-list "${latest_tag}..HEAD" --count)  # count against the real tag, before stripping the leading "v"
+        latest_tag=${latest_tag#v}
+        echo "latest_tag=$latest_tag" >> "$GITHUB_ENV"
+        echo "commit_count=$commit_count" >> "$GITHUB_ENV"
+        version="${latest_tag}.${commit_count}"
+        echo "version=$version" >> "$GITHUB_ENV"
+        echo "$version"
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
@@ -80,7 +80,7 @@ jobs:
         -DTTMLIR_ENABLE_RUNTIME=ON \
         -DTTMLIR_ENABLE_RUNTIME_TESTS=ON \
         -DTTMLIR_ENABLE_STABLEHLO=ON \
-        -DTTMLIR_ENABLE_OP_MODEL=ON \
+        -DTTMLIR_ENABLE_OPMODEL=ON \
         -S ${{ steps.strings.outputs.work-dir }}
 
     - name: Lint
@@ -99,10 +99,9 @@ jobs:
         build: [
           {runs-on: ubuntu-latest, enable_perf: OFF, enable_op_model: OFF, name: "run", ttrt_flags: ""},
           {runs-on: ubuntu-latest, enable_perf: ON, enable_op_model: OFF, name: "perf", ttrt_flags: ""},
-          {runs-on: ubuntu-latest, enable_perf: OFF, enable_op_model: ON, name: "op_model" , ttrt_flags: ""}
         ]
 
-    name: Build tt-mlir
+    name: Build and test tt-mlir (compute machine)
     runs-on: ${{ matrix.build.runs-on }}
 
     container:
@@ -142,116 +141,16 @@ jobs:
         create-symlink: true
         key: ${{ matrix.build.runs-on }}-run-ON-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model }}-${{ env.SDK_VERSION }}
 
-    # Build project
-
-    - name: Configure CMake
-      shell: bash
-      run: |
-        source env/activate
-        cmake -G Ninja \
-        -B ${{ steps.strings.outputs.build-output-dir }} \
-        -DCMAKE_CXX_COMPILER=clang++-17 \
-        -DCMAKE_C_COMPILER=clang-17 \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.install-output-dir }} \
-        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-        -DTTMLIR_ENABLE_RUNTIME=ON \
-        -DTTMLIR_ENABLE_RUNTIME_TESTS=ON \
-        -DTT_RUNTIME_ENABLE_PERF_TRACE=${{ matrix.build.enable_perf }} \
-        -DTTMLIR_ENABLE_STABLEHLO=ON \
-        -DTTMLIR_ENABLE_OP_MODEL=${{ matrix.build.enable_op_model }} \
-        -S ${{ steps.strings.outputs.work-dir }}
-
-    - name: Build
-      shell: bash
-      run: |
-        source env/activate
-        cmake --build ${{ steps.strings.outputs.build-output-dir }}
-        cmake --install ${{ steps.strings.outputs.build-output-dir }} --component Test
-
-    - name: Unique-ify clang-tidy fixes
-      shell: bash
-      if: failure() && steps.lint.outcome == 'failure'
-      run: |
-        source env/activate
-        python tools/scripts/filter-clang-tidy-fixes.py ${{ steps.strings.outputs.build-output-dir }}/clang-tidy-fixes.yaml
-
-    - name: Clang-tidy PR Comments
-      uses: platisd/clang-tidy-pr-comments@a8811fa17cd6bd02c52a3791b44f9840777e396a
-      if: failure() && steps.lint.outcome == 'failure'
+    - name: Run build and test tt-mlir
+      uses: ./.github/actions/build-tt-mlir-action
       with:
-        # The GitHub token (or a personal access token)
-        github_token: ${{ secrets.GITHUB_TOKEN }}
-        # The path to the clang-tidy fixes generated above
-        clang_tidy_fixes: ${{ steps.strings.outputs.build-output-dir }}/clang-tidy-fixes.yaml
-        # Optionally set to true if you want the Action to request
-        # changes in case warnings are found
-        request_changes: false
-        # Optionally set the number of comments per review
-        # to avoid GitHub API timeouts for heavily loaded
-        # pull requests
-        suggestions_per_comment: 10
-        python_path: "python3"
-
-    - name: Run Test
-      shell: bash
-      run: |
-        source env/activate
-        cmake --build ${{ steps.strings.outputs.build-output-dir }} -- check-ttmlir
-        cp build/test/report.xml ${{ steps.strings.outputs.test_report_path }}
-
-    - name: Upload Test Report
-      uses: actions/upload-artifact@v4
-      with:
-        name: test-reports-${{ matrix.build.runs-on }}-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model }}
-        path: ${{ steps.strings.outputs.test_report_path }}
-
-    - name: Show Test Report
-      uses: mikepenz/action-junit-report@v4
-      if: success() || failure()
-      with:
-        report_paths: ${{ steps.strings.outputs.test_report_path }}
-        check_name: MLIR Tests
-
-    # Build and upload ttrt
-
-    - name: Build ttrt
-      shell: bash
-      run: |
-        source env/activate
-        cmake --build ${{ steps.strings.outputs.build-output-dir }} -- ttrt
-
-    - name: Upload ttrt whl
-      uses: actions/upload-artifact@v4
-      with:
-        name: ttrt-whl-${{ matrix.build.name }}
-        path: build/runtime/tools/python/build/ttrt*.whl
-
-    # This is needed to preserve file permissions
-    # https://github.com/actions/upload-artifact?tab=readme-ov-file#permission-loss
-    - name: 'Tar install directory'
-      shell: bash
-      working-directory: ${{ steps.strings.outputs.install-output-dir }}
-      run: tar cvf artifact.tar .
-
-    - name: Upload install folder to archive
-      uses: actions/upload-artifact@v4
-      with:
-        name: install-artifacts-${{ matrix.build.name }}
-        path: ${{ steps.strings.outputs.install-output-dir }}/artifact.tar
-
-    - name: Get the latest tag
-      shell: bash
-      run: |
-        latest_tag=$(git describe --tags --abbrev=0)
-        latest_tag=${latest_tag#v}
-        echo "latest_tag=$latest_tag" >> $GITHUB_ENV
-        commit_count=$(git rev-list ${{ env.latest_tag }}..HEAD --count)
-        echo "commit_count=$commit_count" >> $GITHUB_ENV
-        version="${{ env.latest_tag }}.${{ env.commit_count }}"
-        echo "version=$version" >> $GITHUB_ENV
-        echo $version
-
+        enable-perf: ${{ matrix.build.enable_perf }}
+        enable-op-model: ${{ matrix.build.enable_op_model }}
+        build-name: ${{ matrix.build.name }}
+        build-output-dir: ${{ steps.strings.outputs.build-output-dir }}
+        install-output-dir: ${{ steps.strings.outputs.install-output-dir }}
+        work-dir: ${{ steps.strings.outputs.work-dir }}
+        test_report_path: ${{ steps.strings.outputs.test_report_path }}
 
   # Run tests on TT hardware
 
@@ -673,7 +572,7 @@ jobs:
         -DTTMLIR_ENABLE_RUNTIME_TESTS=OFF \
         -DTT_RUNTIME_ENABLE_PERF_TRACE=${{ matrix.build.enable_perf }} \
         -DTTMLIR_ENABLE_STABLEHLO=OFF \
-        -DTTMLIR_ENABLE_OP_MODEL=${{ matrix.build.enable_op_model }} \
+        -DTTMLIR_ENABLE_OPMODEL=${{ matrix.build.enable_op_model }} \
         -S ${{ steps.strings.outputs.work-dir }}
 
     - name: Build tt-explorer
@@ -688,3 +587,73 @@ jobs:
         source env/activate
         pytest tools/explorer/test/run_tests.py
         # collect results
+
+
+  build-ttmlir-opmodelinterface:
+    needs: build-image
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        build: [
+          {runs-on: n300, enable_perf: OFF, enable_op_model: ON, name: "op_model" , ttrt_flags: ""}
+        ]
+
+    name: Run build and test tt-mlir (TT machine)
+    runs-on: ${{ matrix.build.runs-on }}
+
+    container:
+      image: ${{ needs.build-image.outputs.docker-image }}
+      options: --device /dev/tenstorrent/0
+      volumes:
+        - /dev/hugepages:/dev/hugepages
+        - /dev/hugepages-1G:/dev/hugepages-1G
+        - /etc/udev/rules.d:/etc/udev/rules.d
+        - /lib/modules:/lib/modules
+        - /opt/tt_metal_infra/provisioning/provisioning_env:/opt/tt_metal_infra/provisioning/provisioning_env
+
+    steps:
+
+    - uses: actions/checkout@v4
+      with:
+          fetch-depth: 0
+
+    - name: Set reusable strings
+      id: strings
+      shell: bash
+      env:
+        job-name: "Run build and test tt-mlir (TT machine) (${{ matrix.build.runs-on }}, ${{ matrix.build.enable_perf }}, ${{ matrix.build.enable_op_model }}, ${{ matrix.build.name }})"  # must match this job's `name:` plus its matrix suffix, or the lookup below finds nothing
+      run: |
+        echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT"
+        echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT"
+        echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT"
+
+        # Github job context unfortunately doesn't contain job_id, this is the workaround how to fetch it using GH API
+        echo "Expected job name: ${{ env.job-name }}"
+        JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GH_TOKEN }}" \
+          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}/jobs" | \
+          jq -r '.jobs[] | select(.name | contains("${{ env.job-name }}")) | .id ')
+        echo "Current job id: $JOB_ID"
+        echo "job-id=$JOB_ID" >> "$GITHUB_OUTPUT"
+        echo "test_report_path=report_$JOB_ID.xml" >> "$GITHUB_OUTPUT"
+
+    - name: Git safe dir
+      run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }}
+
+    - name: ccache  # cache key separates runner type, perf and op_model flavours, and SDK version
+      uses: hendrikmuhs/ccache-action@v1.2
+      with:
+        create-symlink: true
+        key: ${{ matrix.build.runs-on }}-run-ON-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model }}-${{ env.SDK_VERSION }}
+
+    # Build project
+    - name: Run build and test tt-mlir
+      uses: ./.github/actions/build-tt-mlir-action
+      with:
+        enable-perf: ${{ matrix.build.enable_perf }}
+        enable-op-model: ${{ matrix.build.enable_op_model }}
+        build-name: ${{ matrix.build.name }}
+        build-output-dir: ${{ steps.strings.outputs.build-output-dir }}
+        install-output-dir: ${{ steps.strings.outputs.install-output-dir }}
+        work-dir: ${{ steps.strings.outputs.work-dir }}
+        test_report_path: ${{ steps.strings.outputs.test_report_path }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,7 +8,7 @@ endif()
 option(TT_RUNTIME_ENABLE_PERF_TRACE "Enable performance mode" OFF)
 option(TTMLIR_ENABLE_RUNTIME "Enable runtime" OFF)
 option(TTMLIR_ENABLE_STABLEHLO "Enable StableHLO support" OFF)
-option(TTMLIR_ENABLE_OP_MODEL "Enable OpModel support" OFF)
+option(TTMLIR_ENABLE_OPMODEL "Enable OpModel support" OFF)
 option(TTMLIR_ENABLE_SHARED_LIB "Enable Shared lib building" ON)
 
 if (NOT TTMLIR_ENABLE_RUNTIME)
@@ -27,7 +27,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(TTMLIR_ENABLE_BINDINGS_PYTHON ON CACHE BOOL "Enable Python bindings")
 
 if (APPLE)
-  set(TTMLIR_ENABLE_OP_MODEL OFF)
+  set(TTMLIR_ENABLE_OPMODEL OFF)
   message(WARNING "TTNNOpModelLib is disabled on Apple platforms. Optimizer will not get true performance.")
 endif()