diff --git a/Jenkinsfile b/Jenkinsfile index 3d0d0bbf0..0f9dc70b4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -62,7 +62,7 @@ def dailyDeviceTest = { sh "pytest examples/app_mobilenetv2" } runPytestDevice("8x8/test_broadcast", "-n 1 --tc 1", "broadcast_1") - runPytestDevice("8x8/test_concatenate", "-n 1 --tc 1", "concat_1") + runPytestDevice("16x8/test_transpose", "-n 1", "16x8_transpose") runPytestDevice("8x8/test_concatenate", "-n 1 --tc 5", "concat_5") runPytestDevice("8x8/test_mean", "-n 1 --tc 1", "mean_1") runPytestDevice("16x8/test_mean", "-n 1 --tc 1", "16x8_mean_1") diff --git a/build.sh b/build.sh index 773ee3426..9a09bbefb 100755 --- a/build.sh +++ b/build.sh @@ -4,24 +4,27 @@ set -e OS=$(uname) if [ "$OS" = "Linux" ]; then - NUM_PROCS=$(nproc) + NUM_PROCS=$(nproc) elif [ "$OS" = "Darwin" ]; then - NUM_PROCS=$(sysctl -n hw.ncpu) + NUM_PROCS=$(sysctl -n hw.ncpu) else - echo "Unsupported operating system." - exit 1 + echo "Unsupported operating system." + exit 1 fi ACTION="--build" TARGET="" DEBUG="false" LSP="false" -SCRIPT_DIR="$(cd "$(dirname "$0")"; pwd -P)" +SCRIPT_DIR="$( + cd "$(dirname "$0")" + pwd -P +)" ARCH=$(uname -m) MACHINE_ARCH="" -if [ "$ARCH" = "x86_64" ] ; then +if [ "$ARCH" = "x86_64" ]; then MACHINE_ARCH="x86" -elif [[ "$ARCH" == *"arm"* ]] ; then +elif [[ "$ARCH" == *"arm"* ]]; then MACHINE_ARCH="arm" else echo "Unknown architecture" @@ -29,207 +32,212 @@ else fi help() { - echo "Usage: $(basename "$0") [ACTIONS]..." - echo " -b Build (default)" - echo " -c Clean build" - echo " -t Test build" - echo " -d Enable debug" - echo " -j [NUM_PROCS] Set number of jobs (default: nproc)" - echo " -T [TARGET] Set target:" - echo " init Initialise repository (update submodules and patch ltflm)" - echo " all Build everything" - echo " xinterpreter Build interpreter only" - echo " xformer Build compiler only" - echo " -h Show this help message" - exit 1 + echo "Usage: $(basename "$0") [ACTIONS]..." + echo " -b Build (default)" + echo " -c Clean build" + echo " -t Test build" + echo " -d Enable debug" + echo " -j [NUM_PROCS] Set number of jobs (default: nproc)" + echo " -T [TARGET] Set target:" + echo " init Initialise repository (update submodules and patch ltflm)" + echo " all Build everything" + echo " xinterpreter Build interpreter only" + echo " xformer Build compiler only" + echo " -h Show this help message" + exit 1 } while getopts "cbtdj:T:hl" opt; do - case $opt in - c) - ACTION="--clean";; - b) - ACTION="--build";; - t) - ACTION="--test";; - d) - DEBUG="true";; - j) - NUM_PROCS="$OPTARG";; - T) - TARGET="$OPTARG";; - h) - help;; - l) - LSP="true";; - *) - echo "Invalid option: -$OPTARG" >&2 - help;; - esac + case $opt in + c) + ACTION="--clean" + ;; + b) + ACTION="--build" + ;; + t) + ACTION="--test" + ;; + d) + DEBUG="true" + ;; + j) + NUM_PROCS="$OPTARG" + ;; + T) + TARGET="$OPTARG" + ;; + h) + help + ;; + l) + LSP="true" + ;; + *) + echo "Invalid option: -$OPTARG" >&2 + help + ;; + esac done if [ -z "$TARGET" ]; then - echo "No target specified." - help + echo "No target specified." 
+ help fi bazel_compile_commands() { - cd xformer - bazel run @hedron_compile_commands//:refresh_all - cd $SCRIPT_DIR + cd xformer + bazel run @hedron_compile_commands//:refresh_all + cd "$SCRIPT_DIR" } build_xformer() { - if [ "$LSP" = "true" ] ; then - bazel_compile_commands - fi - cd xformer - bazel_cmd="bazel build --jobs $NUM_PROCS //:xcore-opt" - if [ "$MACHINE_ARCH" = "arm" ] ; then - bazel_cmd+=" --cpu=darwin_arm64" - fi - if [ "$DEBUG" = "true" ] ; then - bazel_cmd+=" -c dbg --spawn_strategy=local --javacopt=\"-g\" --copt=\"-g\" --strip=\"never\" --verbose_failures --sandbox_debug" - fi - eval $bazel_cmd - cd $SCRIPT_DIR + if [ "$LSP" = "true" ]; then + bazel_compile_commands + fi + cd xformer + bazel_cmd="bazel build --jobs $NUM_PROCS //:xcore-opt" + if [ "$MACHINE_ARCH" = "arm" ]; then + bazel_cmd+=" --cpu=darwin_arm64" + fi + if [ "$DEBUG" = "true" ]; then + bazel_cmd+=" -c dbg --spawn_strategy=local --javacopt=\"-g\" --copt=\"-g\" --strip=\"never\" --verbose_failures --sandbox_debug" + fi + eval "$bazel_cmd" + cd "$SCRIPT_DIR" } version_check() { - cd xformer - ./version_check.sh - cd $SCRIPT_DIR + cd xformer + ./version_check.sh + cd "$SCRIPT_DIR" } submodule_update() { - git submodule update --init --recursive --jobs $NUM_PROCS + git submodule update --init --recursive --jobs "$NUM_PROCS" } patch() { - make -C third_party/lib_tflite_micro patch + make -C third_party/lib_tflite_micro patch } unsupported_action() { - echo "Action $ACTION not supported for target $TARGET" - exit 1 + echo "Action $ACTION not supported for target $TARGET" + exit 1 } create_zip() { - cd third_party/lib_tflite_micro - mkdir -p build - cd build - if [ "$1" = "xcore" ]; then - cmake .. --toolchain=../lib_tflite_micro/submodules/xmos_cmake_toolchain/xs3a.cmake - else - cmake .. -DLIB_NAME=x86tflitemicro - fi - make create_zip -j$NUM_PROCS - cd $SCRIPT_DIR - mv third_party/lib_tflite_micro/build/release_archive.zip python/xmos_ai_tools/runtime/release_archive.zip - cd python/xmos_ai_tools/runtime - rm -rf include - unzip -o release_archive.zip - rm release_archive.zip - cd $SCRIPT_DIR + cd third_party/lib_tflite_micro + mkdir -p build + cd build + if [ "$1" = "xcore" ]; then + cmake .. --toolchain=../lib_tflite_micro/submodules/xmos_cmake_toolchain/xs3a.cmake + else + cmake .. 
-DLIB_NAME=x86tflitemicro
+    fi
+    make create_zip "-j$NUM_PROCS"
+    cd "$SCRIPT_DIR"
+    mv third_party/lib_tflite_micro/build/release_archive.zip python/xmos_ai_tools/runtime/release_archive.zip
+    cd python/xmos_ai_tools/runtime
+    rm -rf include
+    unzip -o release_archive.zip
+    rm release_archive.zip
+    cd "$SCRIPT_DIR"
 }

 build_xinterpreter() {
-    cd $SCRIPT_DIR
-    if [ "$LSP" = "true" ] ; then
-        bear make -C python/xmos_ai_tools/xinterpreters install -j$NUM_PROCS
-    else
-        make -C python/xmos_ai_tools/xinterpreters install -j$NUM_PROCS
-    fi
+    cd "$SCRIPT_DIR"
+    make -C python/xmos_ai_tools/xinterpreters install "-j$NUM_PROCS"
 }

 xformer_integration_test() {
-    pytest integration_tests/runner.py --models_path integration_tests/models/8x8 -n $NUM_PROCS
-    pytest integration_tests/runner.py --models_path integration_tests/models/16x8 -n $NUM_PROCS
-    pytest integration_tests/runner.py --models_path integration_tests/models/float32 -n $NUM_PROCS
-    pytest integration_tests/runner.py --models_path integration_tests/models/bnns --bnn -n $NUM_PROCS
+    pytest integration_tests/runner.py --models_path integration_tests/models/8x8 -n "$NUM_PROCS"
+    pytest integration_tests/runner.py --models_path integration_tests/models/16x8 -n "$NUM_PROCS"
+    pytest integration_tests/runner.py --models_path integration_tests/models/float32 -n "$NUM_PROCS"
+    pytest integration_tests/runner.py --models_path integration_tests/models/bnns --bnn -n "$NUM_PROCS"
 }

 clean_xinterpreter() {
-    make -C python/xmos_ai_tools/xinterpreters clean
+    make -C python/xmos_ai_tools/xinterpreters clean
 }

 clean_runtime() {
-    rm -rf third_party/lib_tflite_micro/build
+    rm -rf third_party/lib_tflite_micro/build
 }

 test_xinterpreter() {
-    echo "Not implemented yet"
-    exit 1
+    echo "Not implemented yet"
+    exit 1
 }

 # we want this script to build the repository it's in, no matter where we call it from
-cd $SCRIPT_DIR
+cd "$SCRIPT_DIR"

 case $TARGET in
-    init)
-        submodule_update
-        patch
+init)
+    submodule_update
+    patch
+    ;;
+xformer)
+    build_xformer
+    ;;
+xinterpreter)
+    case $ACTION in
+    --build)
+        version_check
+        # create_zip "xcore"
+        clean_runtime
+        create_zip "x86"
+        build_xinterpreter
         ;;
-    xformer)
+    --clean)
+        clean_xinterpreter
+        clean_runtime
+        ;;
+    --test)
+        test_xinterpreter
+        ;;
+    *)
+        unsupported_action
+        ;;
+    esac
+    ;;
+# this is a mess: xinterpreter-nozip only used for CI
+xinterpreter-nozip)
+    case $ACTION in
+    --build)
+        version_check
+        build_xinterpreter
+        ;;
+    --clean)
+        clean_xinterpreter
+        ;;
+    --test)
+        test_xinterpreter
+        ;;
+    esac
+    ;;
+all)
+    case $ACTION in
+    --build)
+        version_check
+        build_xformer
+        create_zip "xcore"
+        build_xinterpreter
+        ;;
+    --clean)
+        clean_xinterpreter
         ;;
-    xinterpreter)
-        case $ACTION in
-        --build)
-            version_check
-            create_zip "xcore"
-            clean_runtime
-            create_zip "x86"
-            build_xinterpreter
-            ;;
-        --clean)
-            clean_xinterpreter
-            clean_runtime
-            ;;
-        --test)
-            test_xinterpreter
-            ;;
-        *)
-            unsupported_action
-            ;;
-        esac
-        ;;
-    # this is a mess: xinterpreter-nozip only used for CI
-    xinterpreter-nozip)
-        case $ACTION in
-        --build)
-            version_check
-            build_xinterpreter
-            ;;
-        --clean)
-            clean_xinterpreter
-            ;;
-        --test)
-            test_xinterpreter
-            ;;
-        esac
-        ;;
-    all)
-        case $ACTION in
-        --build)
-            version_check
-            build_xformer
-            create_zip "xcore"
-            build_xinterpreter
-            ;;
-        --clean)
-            clean_xinterpreter
-            ;;
-        --test)
-            xformer_integration_test
-            ;;
-        *)
-            unsupported_action
-            ;;
-        esac
+    --test)
+        xformer_integration_test
         ;;
     *)
-        echo "Unknown target: $TARGET"
-        help
+        echo 
"Unknown target: $TARGET" + help + ;; esac diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_0.mlir b/integration_tests/models/16x8/test_transpose/test_transpose_0.mlir new file mode 100644 index 000000000..178917139 --- /dev/null +++ b/integration_tests/models/16x8/test_transpose/test_transpose_0.mlir @@ -0,0 +1,5 @@ +func.func @main(%arg0: tensor<4x6x5x8x!quant.uniform>) -> (tensor<4x5x6x8x!quant.uniform>) { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<4xi32>, value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tfl.transpose"(%arg0, %0) : (tensor<4x6x5x8x!quant.uniform>, tensor<4xi32>) -> tensor<4x5x6x8x!quant.uniform> + return %1 : tensor<4x5x6x8x!quant.uniform> +} diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_0.tflite b/integration_tests/models/16x8/test_transpose/test_transpose_0.tflite new file mode 100644 index 000000000..6d527fc6f Binary files /dev/null and b/integration_tests/models/16x8/test_transpose/test_transpose_0.tflite differ diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_1.mlir b/integration_tests/models/16x8/test_transpose/test_transpose_1.mlir new file mode 100644 index 000000000..92c14d23e --- /dev/null +++ b/integration_tests/models/16x8/test_transpose/test_transpose_1.mlir @@ -0,0 +1,8 @@ +func.func @main(%arg0: tensor<3x7x6x4x!quant.uniform>) -> (tensor<4x3x7x6x!quant.uniform>) { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<4xi32>, value = dense<[0, 3, 2, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tfl.transpose"(%arg0, %0) : (tensor<3x7x6x4x!quant.uniform>, tensor<4xi32>) -> tensor<3x4x6x7x!quant.uniform> + %2 = "tfl.pseudo_qconst"() {qtype = tensor<4xi32>, value = dense<[1, 0, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %3 = "tfl.transpose"(%1, %2) : (tensor<3x4x6x7x!quant.uniform>, tensor<4xi32>) -> tensor<4x3x7x6x!quant.uniform> + return %3 : tensor<4x3x7x6x!quant.uniform> +} + diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_1.tflite b/integration_tests/models/16x8/test_transpose/test_transpose_1.tflite new file mode 100644 index 000000000..c7eeeeece Binary files /dev/null and b/integration_tests/models/16x8/test_transpose/test_transpose_1.tflite differ diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_2.mlir b/integration_tests/models/16x8/test_transpose/test_transpose_2.mlir new file mode 100644 index 000000000..669f570f8 --- /dev/null +++ b/integration_tests/models/16x8/test_transpose/test_transpose_2.mlir @@ -0,0 +1,10 @@ +func.func @main(%arg0: tensor<2x8x4x6x!quant.uniform>) -> (tensor<4x2x6x8x!quant.uniform>) { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<4xi32>, value = dense<[1, 0, 3, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tfl.transpose"(%arg0, %0) : (tensor<2x8x4x6x!quant.uniform>, tensor<4xi32>) -> tensor<8x2x6x4x!quant.uniform> + %2 = "tfl.pseudo_qconst"() {qtype = tensor<4xi32>, value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %3 = "tfl.transpose"(%1, %2) : (tensor<8x2x6x4x!quant.uniform>, tensor<4xi32>) -> tensor<8x6x4x2x!quant.uniform> + %4 = "tfl.pseudo_qconst"() {qtype = tensor<4xi32>, value = dense<[2, 3, 1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> + %5 = "tfl.transpose"(%3, %4) : (tensor<8x6x4x2x!quant.uniform>, tensor<4xi32>) -> tensor<4x2x6x8x!quant.uniform> + return %5 : tensor<4x2x6x8x!quant.uniform> +} + diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_2.tflite 
b/integration_tests/models/16x8/test_transpose/test_transpose_2.tflite new file mode 100644 index 000000000..ded9a14ba Binary files /dev/null and b/integration_tests/models/16x8/test_transpose/test_transpose_2.tflite differ diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_3.mlir b/integration_tests/models/16x8/test_transpose/test_transpose_3.mlir new file mode 100644 index 000000000..292a6cc39 --- /dev/null +++ b/integration_tests/models/16x8/test_transpose/test_transpose_3.mlir @@ -0,0 +1,6 @@ +func.func @main(%arg0: tensor<2x3x4x5x6x!quant.uniform>) -> (tensor<2x6x5x4x3x!quant.uniform>) { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<5xi32>, value = dense<[0, 4, 3, 2, 1]> : tensor<5xi32>} : () -> tensor<5xi32> + %1 = "tfl.transpose"(%arg0, %0) : (tensor<2x3x4x5x6x!quant.uniform>, tensor<5xi32>) -> tensor<2x6x5x4x3x!quant.uniform> + return %1 : tensor<2x6x5x4x3x!quant.uniform> +} + diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_3.tflite b/integration_tests/models/16x8/test_transpose/test_transpose_3.tflite new file mode 100644 index 000000000..9db72fe2b Binary files /dev/null and b/integration_tests/models/16x8/test_transpose/test_transpose_3.tflite differ diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_4.mlir b/integration_tests/models/16x8/test_transpose/test_transpose_4.mlir new file mode 100644 index 000000000..b9ec6939e --- /dev/null +++ b/integration_tests/models/16x8/test_transpose/test_transpose_4.mlir @@ -0,0 +1,7 @@ +func.func @main(%arg0: tensor<3x4x5x6x7x!quant.uniform>) -> (tensor<4x5x6x7x3x!quant.uniform>) { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<5xi32>, value = dense<[0, 4, 3, 2, 1]> : tensor<5xi32>} : () -> tensor<5xi32> + %1 = "tfl.transpose"(%arg0, %0) : (tensor<3x4x5x6x7x!quant.uniform>, tensor<5xi32>) -> tensor<3x7x6x5x4x!quant.uniform> + %2 = "tfl.pseudo_qconst"() {qtype = tensor<5xi32>, value = dense<[4, 3, 2, 1, 0]> : tensor<5xi32>} : () -> tensor<5xi32> + %3 = "tfl.transpose"(%1, %2) : (tensor<3x7x6x5x4x!quant.uniform>, tensor<5xi32>) -> tensor<4x5x6x7x3x!quant.uniform> + return %3 : tensor<4x5x6x7x3x!quant.uniform> +} diff --git a/integration_tests/models/16x8/test_transpose/test_transpose_4.tflite b/integration_tests/models/16x8/test_transpose/test_transpose_4.tflite new file mode 100644 index 000000000..067ace8a9 Binary files /dev/null and b/integration_tests/models/16x8/test_transpose/test_transpose_4.tflite differ diff --git a/third_party/lib_tflite_micro b/third_party/lib_tflite_micro index 9adb79863..6a387b04c 160000 --- a/third_party/lib_tflite_micro +++ b/third_party/lib_tflite_micro @@ -1 +1 @@ -Subproject commit 9adb79863e4fab86bb45c534dd2f0fb555fc038c +Subproject commit 6a387b04c20602383ab9af903de5092290d70091 diff --git a/xformer/IR/XCoreOps.td b/xformer/IR/XCoreOps.td index acc1598d7..e4c3ccba0 100644 --- a/xformer/IR/XCoreOps.td +++ b/xformer/IR/XCoreOps.td @@ -162,6 +162,21 @@ def XC_ConcatOp : XC_Op<"concat", [Pure]> { let results = (outs TensorOf<[QI8, QI16, F32, I8, I32]> : $output); } +def XC_TransposeOp : XC_Op<"transpose", [Pure]> { + let summary = "Transpose op"; + + let description = [{Transpose op.}]; + + let arguments = (ins + TensorOf<[QI8, QI16, F32, I8, I32]> : $input, + + I32ArrayAttr:$offsets, + I32ArrayAttr:$t_shape + ); + + let results = (outs TensorOf<[QI8, QI16, F32, I8, I32]> : $output); +} + def XC_AddOp : XC_Op<"add", [Pure, XC_MemoryOverlappable]> { let summary = "Add op"; diff --git a/xformer/Test/pytorch_transpose.mlir 
b/xformer/Test/pytorch_transpose.mlir index 55ab7d8c0..e5a02e09e 100644 --- a/xformer/Test/pytorch_transpose.mlir +++ b/xformer/Test/pytorch_transpose.mlir @@ -1,5 +1,4 @@ // RUN: xcore-opt --mlir-io %s --xcore-optimize-transpose | FileCheck %s -// RUN: xcore-opt --mlir-io %s --xcore-optimize-transpose --xcore-allow-input-modification | FileCheck %s -check-prefix=INPUT-CHECK // CHECK-LABEL: hoist_pad_above_transpose func.func @hoist_pad_above_transpose(%arg0: tensor>) -> (tensor>) { @@ -14,8 +13,6 @@ func.func @hoist_pad_above_transpose(%arg0: tensor> } -// ----- - // CHECK-LABEL: fold_cancellable_transpose func.func @fold_cancellable_transpose(%arg0: tensor>) -> (tensor>) { %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -29,30 +26,14 @@ func.func @fold_cancellable_transpose(%arg0: tensor> } -// ----- - -// INPUT-CHECK-LABEL: fold_to_input -// INPUT-CHECK: %arg0: tensor>) -> (tensor>) { - %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> - %11 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - %12 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> - // INPUT-CHECK-NOT: transpose - %19 = "tfl.transpose"(%arg0, %11) : (tensor>, tensor<4xi32>) -> tensor> - %20 = "tfl.pad"(%19, %12) : (tensor>, tensor<4x2xi32>) -> tensor> - return %20 : tensor> -} - -// ----- - -// CHECK-LABEL: fold_to_input2 -// CHECK: %arg0: tensor>) -> (tensor>) { - %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> - %11 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - %12 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> +// CHECK-LABEL: merge_consecutive_transposes +func.func @merge_consecutive_transposes(%arg0: tensor>) -> (tensor>) { + // CHECK: dense<[0, 1, 3, 2]> // CHECK: transpose - %19 = "tfl.transpose"(%arg0, %11) : (tensor>, tensor<4xi32>) -> tensor> - %20 = "tfl.pad"(%19, %12) : (tensor>, tensor<4x2xi32>) -> tensor> - return %20 : tensor> + // CHECK-NOT: transpose + %10 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %11 = "tfl.pseudo_const"() {value = dense<[0, 3, 2, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %18 = "tfl.transpose"(%arg0, %10) : (tensor>, tensor<4xi32>) -> tensor> + %19 = "tfl.transpose"(%18, %11) : (tensor>, tensor<4xi32>) -> tensor> + return %19 : tensor> } diff --git a/xformer/Test/pytorch_transpose_input.mlir b/xformer/Test/pytorch_transpose_input.mlir new file mode 100644 index 000000000..55ab7d8c0 --- /dev/null +++ b/xformer/Test/pytorch_transpose_input.mlir @@ -0,0 +1,58 @@ +// RUN: xcore-opt --mlir-io %s --xcore-optimize-transpose | FileCheck %s +// RUN: xcore-opt --mlir-io %s --xcore-optimize-transpose --xcore-allow-input-modification | FileCheck %s -check-prefix=INPUT-CHECK + +// CHECK-LABEL: hoist_pad_above_transpose +func.func @hoist_pad_above_transpose(%arg0: tensor>) -> (tensor>) { + %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %11 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %12 = "tfl.pseudo_const"() {value = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + // CHECK: pad + // CHECK-NOT: transpose + %18 = "tfl.transpose"(%arg0, %10) : (tensor>, 
tensor<4xi32>) -> tensor> + %20 = "tfl.pad"(%18, %12) : (tensor>, tensor<4x2xi32>) -> tensor> + %19 = "tfl.transpose"(%20, %11) : (tensor>, tensor<4xi32>) -> tensor> + return %19 : tensor> +} + +// ----- + +// CHECK-LABEL: fold_cancellable_transpose +func.func @fold_cancellable_transpose(%arg0: tensor>) -> (tensor>) { + %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %11 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %12 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + // CHECK-NOT: transpose + // CHECK: pad + %18 = "tfl.transpose"(%arg0, %10) : (tensor>, tensor<4xi32>) -> tensor> + %19 = "tfl.transpose"(%18, %11) : (tensor>, tensor<4xi32>) -> tensor> + %20 = "tfl.pad"(%19, %12) : (tensor>, tensor<4x2xi32>) -> tensor> + return %20 : tensor> +} + +// ----- + +// INPUT-CHECK-LABEL: fold_to_input +// INPUT-CHECK: %arg0: tensor>) -> (tensor>) { + %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %11 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %12 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + // INPUT-CHECK-NOT: transpose + %19 = "tfl.transpose"(%arg0, %11) : (tensor>, tensor<4xi32>) -> tensor> + %20 = "tfl.pad"(%19, %12) : (tensor>, tensor<4x2xi32>) -> tensor> + return %20 : tensor> +} + +// ----- + +// CHECK-LABEL: fold_to_input2 +// CHECK: %arg0: tensor>) -> (tensor>) { + %10 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %11 = "tfl.pseudo_const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %12 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + // CHECK: transpose + %19 = "tfl.transpose"(%arg0, %11) : (tensor>, tensor<4xi32>) -> tensor> + %20 = "tfl.pad"(%19, %12) : (tensor>, tensor<4x2xi32>) -> tensor> + return %20 : tensor> +} diff --git a/xformer/Transforms/OptimizeTranspose.cpp b/xformer/Transforms/OptimizeTranspose.cpp index 8ac71be2e..aa52ed7f5 100644 --- a/xformer/Transforms/OptimizeTranspose.cpp +++ b/xformer/Transforms/OptimizeTranspose.cpp @@ -140,6 +140,107 @@ struct FoldTrReFCPattern : public OpRewritePattern { } }; +struct FoldDoubleTransposePattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(TFL::TransposeOp transposeOp, + PatternRewriter &rewriter) const override { + // Ensure the TransposeOp has a single use + if (!transposeOp->hasOneUse()) + return failure(); + // Check if the user operation is a transpose op + Operation *userOp = *transposeOp->getUsers().begin(); + auto userTransposeOp = dyn_cast(userOp); + if (!userTransposeOp) + return failure(); + // Get the permutation used in the transposes + DenseIntElementsAttr perm0; + DenseIntElementsAttr perm1; + if (!matchPattern(transposeOp.getPerm(), m_Constant(&perm0)) || + !matchPattern(userTransposeOp.getPerm(), m_Constant(&perm1))) + return failure(); + + SmallVector permVec; + for (auto val : perm1.getValues()) { + permVec.push_back(perm0.getValues()[val]); + } + // Create perm constant op + auto permType = RankedTensorType::get( + {static_cast(permVec.size())}, rewriter.getIntegerType(32)); + + auto permAttr = DenseIntElementsAttr::get(permType, permVec); + auto permConstOp = + 
rewriter.create<TFL::ConstOp>(transposeOp.getLoc(), permType, permAttr);
+
+    // Create the new transpose op
+    auto newTransposeOp = rewriter.create<TFL::TransposeOp>(
+        transposeOp.getLoc(), userTransposeOp.getType(), transposeOp.getInput(),
+        permConstOp.getResult());
+
+    rewriter.replaceOp(userOp, newTransposeOp.getResult());
+    rewriter.eraseOp(transposeOp);
+    return success();
+  }
+};
+
+// Replace TransposeOp with ReshapeOp where the two are equivalent.
+// A transpose is equivalent to a reshape if, for every moved dimension, at
+// most one of the dimensions it moves across (itself included) has a size
+// other than 1.
+struct FoldTransposeToReshapePattern
+    : public OpRewritePattern<TFL::TransposeOp> {
+  using OpRewritePattern<TFL::TransposeOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(TFL::TransposeOp transposeOp,
+                                PatternRewriter &rewriter) const override {
+    DenseIntElementsAttr perm;
+    if (!matchPattern(transposeOp.getPerm(), m_Constant(&perm)))
+      return failure();
+
+    SmallVector<int32_t> permVec;
+    for (auto val : perm.getValues<int32_t>()) {
+      permVec.push_back(val);
+    }
+    // Get the input shape
+    auto inputType =
+        transposeOp.getInput().getType().dyn_cast<RankedTensorType>();
+    if (!inputType)
+      return failure();
+    ArrayRef<int64_t> inputShape = inputType.getShape();
+    for (size_t i = 0; i < permVec.size(); ++i) {
+      if (permVec[i] == i)
+        continue;
+      // Check that all dims between i and permVec[i] are 1, except for at
+      // most one
+      int s = permVec[i] < i ? permVec[i] : i;
+      int e = permVec[i] < i ? i : permVec[i];
+      bool foundNonOne = false;
+      for (size_t j = s; j <= e; ++j) {
+        if (inputShape[j] != 1) {
+          if (foundNonOne)
+            return failure();
+          foundNonOne = true;
+        }
+      }
+    }
+    // Get the output shape
+    auto outputType =
+        transposeOp.getResult().getType().dyn_cast<RankedTensorType>();
+    // Convert it to a small vector
+    SmallVector<int64_t> outputShape;
+    for (auto val : outputType.getShape()) {
+      outputShape.push_back(val);
+    }
+    // Create the new shape constant op
+    auto newShapeConstOp =
+        utils::createShapeConstOp(rewriter, transposeOp.getLoc(), outputShape);
+    // The checks above guarantee the transpose can be replaced with a reshape
+    auto reshapeOp = rewriter.create<TFL::ReshapeOp>(
+        transposeOp.getLoc(), transposeOp.getType(), transposeOp.getInput(),
+        newShapeConstOp);
+
+    rewriter.replaceOp(transposeOp, reshapeOp.getResult());
+    return success();
+  }
+};
+
 struct FoldFCReTrPattern : public OpRewritePattern<TFL::TransposeOp> {
   using OpRewritePattern<TFL::TransposeOp>::OpRewritePattern;
@@ -367,39 +468,6 @@ struct MoveTransposeForwardOverUnaryOpPattern
   }
 };

-struct FoldCancellableTransposePattern
-    : public OpRewritePattern<TFL::TransposeOp> {
-  using OpRewritePattern<TFL::TransposeOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(TFL::TransposeOp op,
-                                PatternRewriter &rewriter) const override {
-
-    // Check for invalid types and return
-    // Defining op must be transpose
-    auto transposeOp =
-        dyn_cast_or_null<TFL::TransposeOp>(op.getInput().getDefiningOp());
-    if (!transposeOp) {
-      return failure();
-    }
-
-    // Get transpose permutations
-    DenseIntElementsAttr perm0;
-    DenseIntElementsAttr perm1;
-    if (!matchPattern(op.getPerm(), m_Constant(&perm0)) ||
-        !matchPattern(transposeOp.getPerm(), m_Constant(&perm1))) {
-      return failure();
-    }
-
-    // Do permutation indices cancel each other?
- if (!TF::AreCancellablePermutations(perm0, perm1)) { - return failure(); - } - - rewriter.replaceOp(op, transposeOp.getInput()); - - return success(); - } -}; struct MoveTransposeForwardOverConcatOpPattern : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -494,8 +562,8 @@ struct MoveTransposeForwardOverConcatOpPattern auto permType = RankedTensorType::get( {static_cast(permVec.size())}, rewriter.getIntegerType(32)); auto permAttr = DenseIntElementsAttr::get(permType, permVec); - auto permConstOp = rewriter.create(concatOp.getLoc(), - permType, permAttr); + auto permConstOp = + rewriter.create(concatOp.getLoc(), permType, permAttr); // Create the new TransposeOp with the original output type auto newTransposeOp = rewriter.create( @@ -563,7 +631,7 @@ struct HoistTransposeWCHAbovePadPattern std::vector paddingValues{0, 0, 1, 1, 1, 1, 0, 0}; auto paddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), paddingValues); - auto paddingOp = rewriter.create( + auto paddingOp = rewriter.create( padOp->getLoc(), RankedTensorType::get({4, 2}, rewriter.getI32Type()), paddingAttr); auto newPad = rewriter.create( @@ -629,9 +697,10 @@ void OptimizeTranspose::runOnOperation() { // Try to merge transpose -> ops -> inverse transpose RewritePatternSet mergePatterns(ctx); - mergePatterns.insert(ctx); + mergePatterns + .insert(ctx); if (mergeTransposeOption) { (void)applyPatternsAndFoldGreedily(func, std::move(mergePatterns)); } @@ -640,7 +709,8 @@ void OptimizeTranspose::runOnOperation() { RewritePatternSet patterns(ctx); patterns.insert(ctx); - patterns.insert(ctx); + patterns.insert(ctx); + patterns.insert(ctx); // TODO - enable after transpose permutation fix // patterns.insert(ctx); // patterns.insert(ctx); diff --git a/xformer/Transforms/Passes.cpp b/xformer/Transforms/Passes.cpp index fc7e9e12d..6acce3f26 100644 --- a/xformer/Transforms/Passes.cpp +++ b/xformer/Transforms/Passes.cpp @@ -24,6 +24,7 @@ void buildXCoreRemainingPassPipeline(OpPassManager &pm) { pm.addPass(createOptimizeTransposePass()); // Run canonicalization for constant folding Transpose, if any pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(createOptimizeTransposePass()); pm.addPass(createReplaceAvgPoolWithConv2DPass()); pm.addPass(createReplaceFCWithConv2DPass()); if (opSplitTensorArenaOption) { @@ -49,6 +50,7 @@ void buildXCoreRemainingPassPipeline(OpPassManager &pm) { pm.addPass(createReplaceSlicePass()); pm.addPass(createReplaceBroadcastPass()); pm.addPass(createReplaceConcatPass()); + pm.addPass(createReplaceTransposePass()); pm.addPass(createApplyXCPatternsPass()); // Add to pipeline only if weights file option is provided if (!weightsFilenameOption.empty()) { diff --git a/xformer/Transforms/Passes.h b/xformer/Transforms/Passes.h index bf7ac59ae..dfdc15a7e 100644 --- a/xformer/Transforms/Passes.h +++ b/xformer/Transforms/Passes.h @@ -41,6 +41,7 @@ std::unique_ptr> createReplaceSlicePass(); std::unique_ptr> createReplaceBroadcastPass(); std::unique_ptr> createReplacePadPass(); std::unique_ptr> createReplaceConcatPass(); +std::unique_ptr> createReplaceTransposePass(); std::unique_ptr> createReplaceConv2DPass(); std::unique_ptr> createReplaceTransposeConvPass(); std::unique_ptr> createApplyXCPatternsPass(); diff --git a/xformer/Transforms/ReplaceTranspose.cpp b/xformer/Transforms/ReplaceTranspose.cpp new file mode 100644 index 000000000..2e1c9f8a8 --- /dev/null +++ b/xformer/Transforms/ReplaceTranspose.cpp @@ -0,0 +1,192 @@ +// Replace TFL Transpose with 
xcore Transpose for XCore.
+#include "IR/XCoreOps.h"
+#include "Utils/Util.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+
+namespace mlir::xcore {
+
+namespace {
+
+// Replace TFL Transpose with xcore Transpose for XCore.
+struct ReplaceTranspose
+    : public PassWrapper<ReplaceTranspose, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReplaceTranspose)
+
+  void getDependentDialects(DialectRegistry &registry) const final {
+    registry.insert<XCoreDialect>();
+  }
+  StringRef getArgument() const final { return "xcore-replace-transpose"; }
+  StringRef getDescription() const final {
+    return "Replace TFL Transpose with xcore Transpose for XCore.";
+  }
+  void runOnOperation() override;
+};
+
+// Returns the first index i such that perm[i] + 1 == perm[i + 1], or -1 if
+// no two dimensions remain adjacent after the permutation.
+int twoConsecutive(const SmallVector<int64_t> &perm) {
+  for (int i = 0; i < (int)perm.size() - 1; ++i) {
+    if (perm[i] + 1 == perm[i + 1]) {
+      return i; // Return the index i, not perm[i]
+    }
+  }
+  return -1;
+}
+
+struct ReplaceTransposePattern : public OpRewritePattern<TFL::TransposeOp> {
+  using OpRewritePattern<TFL::TransposeOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TFL::TransposeOp transposeOp,
+                                PatternRewriter &rewriter) const override {
+
+    auto inputType =
+        transposeOp.getInput().getType().dyn_cast<RankedTensorType>();
+    if (!inputType || !inputType.hasStaticShape())
+      return failure();
+
+    ArrayRef<int64_t> inputShape = inputType.getShape();
+    int64_t rank = inputShape.size();
+
+    Value permValue = transposeOp.getPerm();
+    DenseIntElementsAttr permAttr;
+    if (!matchPattern(permValue, m_Constant(&permAttr))) {
+      return failure();
+    }
+
+    SmallVector<int64_t> perm;
+    for (auto val : permAttr.getValues<APInt>()) {
+      perm.push_back(val.getSExtValue());
+    }
+
+    // Drop all size-1 dimensions and remap the permutation accordingly.
+    SmallVector<int64_t> reducedShape;
+    SmallVector<int64_t> reducedPerm;
+    {
+      SmallVector<int64_t> oldToNew;
+      oldToNew.reserve(rank);
+      for (int i = 0; i < rank; ++i) {
+        if (inputShape[i] != 1) {
+          oldToNew.push_back((int)reducedShape.size());
+          reducedShape.push_back(inputShape[i]);
+        } else {
+          oldToNew.push_back(-1);
+        }
+      }
+
+      for (auto p : perm) {
+        if (oldToNew[p] != -1) {
+          reducedPerm.push_back(oldToNew[p]);
+        }
+      }
+
+      if (reducedShape.empty()) {
+        reducedShape.push_back(1);
+        reducedPerm.push_back(0);
+      }
+    }
+
+    // Fold the element size into the shape as an extra innermost dimension,
+    // so the kernel can operate on bytes.
+    const size_t dtype_size = utils::getTypeSize(inputType.getElementType());
+    if (dtype_size != 1) {
+      reducedShape.push_back((int64_t)dtype_size);
+      reducedPerm.push_back((int64_t)reducedShape.size() - 1);
+    }
+    auto mergeConsecutiveDims = [&](SmallVector<int64_t> &shape,
+                                    SmallVector<int64_t> &perm) {
+      while (true) {
+        int i = twoConsecutive(perm);
+        if (i == -1)
+          break;
+        int64_t p1 = perm[i];
+        int64_t p2 = perm[i + 1];
+
+        shape[p1] *= shape[p2];
+        shape.erase(shape.begin() + p2);
+        perm.erase(perm.begin() + i + 1);
+        for (int j = 0; j < (int)perm.size(); ++j) {
+          if (perm[j] > p2) {
+            perm[j] -= 1;
+          }
+        }
+      }
+    };
+
+    mergeConsecutiveDims(reducedShape, reducedPerm);
+
+    if (reducedShape.size() > 4) {
+      return failure();
+    }
+
+    // If size of reducedShape < 4, pad with 1's at the beginning
+    // After padding shapes, adjust perm so that it matches
+    int dimCount = (int)reducedShape.size();
+    int pad = 4 - dimCount;
+    if (pad > 0) {
+      // Insert 1's at the start of shape
+      SmallVector<int64_t> paddedShape;
+      SmallVector<int64_t> paddedPerm;
+      paddedShape.resize(4, 1); // fill with ones
+      for (int i = 0; i < dimCount; ++i) {
+        paddedShape[pad + i] = reducedShape[i];
+      }
+
+      for (int i = 0; i < pad; ++i) {
+        paddedPerm.push_back(i);
+      }
+      for (auto p : reducedPerm) {
+        paddedPerm.push_back(p + pad);
+      }
+
+      reducedShape = paddedShape;
+      reducedPerm = paddedPerm;
+    }
+
+    const int RANK = 4;
+    // offsets[i] is the row-major stride of input dimension i, i.e. the
+    // product of all dimensions after it.
+    SmallVector<int32_t> offsets(RANK);
+    for (int i = 0; i < RANK; ++i) {
+      int32_t prod = 1;
+      for (int j = i + 1; j < RANK; ++j) {
+        prod *= (int32_t)reducedShape[j];
+      }
+      offsets[i] = prod;
+    }
+
+    SmallVector<int32_t> permutedOffsets(RANK);
+    for (int i = 0; i < RANK; ++i) {
+      permutedOffsets[i] = offsets[reducedPerm[i]];
+    }
+
+    // t_shape = tuple(SHAPE[p] for p in PERM)
+    SmallVector<int32_t> tShape(RANK);
+    for (int i = 0; i < RANK; ++i) {
+      tShape[i] = (int32_t)reducedShape[reducedPerm[i]];
+    }
+
+    auto outputType = transposeOp.getOutput().getType();
+    auto newTransposeOp = rewriter.create<TransposeOp>(
+        transposeOp.getLoc(), outputType, transposeOp.getInput(),
+        rewriter.getI32ArrayAttr(permutedOffsets),
+        rewriter.getI32ArrayAttr(tShape));
+
+    rewriter.replaceOp(transposeOp, newTransposeOp.getResult());
+
+    return success();
+  }
+};
+
+void ReplaceTranspose::runOnOperation() {
+  auto *ctx = &getContext();
+  func::FuncOp func = getOperation();
+  RewritePatternSet patterns(ctx);
+  patterns.insert<ReplaceTransposePattern>(ctx);
+  (void)applyPatternsAndFoldGreedily(func, std::move(patterns));
+}
+
+} // namespace
+
+// Creates an instance of the ReplaceTranspose pass.
+std::unique_ptr<OperationPass<func::FuncOp>> createReplaceTransposePass() {
+  return std::make_unique<ReplaceTranspose>();
+}
+
+static PassRegistration<ReplaceTranspose> pass;
+
+} // namespace mlir::xcore
diff --git a/xformer/Transforms/TranslateToCustomOp.cpp b/xformer/Transforms/TranslateToCustomOp.cpp
index 035fc13cf..475e9f16f 100644
--- a/xformer/Transforms/TranslateToCustomOp.cpp
+++ b/xformer/Transforms/TranslateToCustomOp.cpp
@@ -151,6 +151,26 @@ std::vector<uint8_t> ConcatOp::buildCustomOptions() {
   return fbb.GetBuffer();
 }

+std::vector<uint8_t> TransposeOp::buildCustomOptions() {
+  flexbuffers::Builder fbb;
+  auto rootMap = fbb.StartMap();
+  // "s" holds the permuted (output) shape, "o" the permuted input strides.
+  auto tShapeVec = fbb.StartVector("s");
+  auto tShape = getTShape().cast<ArrayAttr>();
+  for (int i = 0; i < 4; ++i) {
+    fbb.Int(tShape[i].cast<IntegerAttr>().getInt());
+  }
+  fbb.EndVector(tShapeVec, false, false);
+  auto offsetsVec = fbb.StartVector("o");
+  auto offsets = getOffsets().cast<ArrayAttr>();
+  for (int j = 0; j < 4; ++j) {
+    fbb.Int(offsets[j].cast<IntegerAttr>().getInt());
+  }
+  fbb.EndVector(offsetsVec, false, false);
+  fbb.EndMap(rootMap);
+  fbb.Finish();
+  return fbb.GetBuffer();
+}
+
 std::vector<uint8_t> LoadWeightsOp::buildCustomOptions() {
   flexbuffers::Builder fbb;
   auto rootMap = fbb.StartMap();
@@ -287,6 +307,7 @@ void TranslateToCustomOp::runOnOperation() {
   patterns.insert>(ctx);
   patterns.insert>(ctx);
   patterns.insert>(ctx);
+  patterns.insert<RewriteToCustomOp<TransposeOp>>(ctx);
   patterns.insert>(ctx);
   patterns.insert>(ctx);
   patterns.insert>(ctx);
diff --git a/xformer/Transforms/XCPatterns.td b/xformer/Transforms/XCPatterns.td
index dcdbb6e9b..802897558 100644
--- a/xformer/Transforms/XCPatterns.td
+++ b/xformer/Transforms/XCPatterns.td
@@ -143,10 +143,6 @@ def: Pat<(TFL_MulOp
 // Pad patterns
 def getPadValue : NativeCodeCall<"getPadValue($_builder, $0)">;

-def getPaddingPlan
-    : NativeCodeCall<
-          "getPaddingPlan($_builder, $0.getDefiningOp())">;
-
 def Has3To4Channel
     : Constraint<CPred<"$0.getType().cast<ShapedType>().getDimSize(3) == 3 && "
                        "$1.getType().cast<ShapedType>().getDimSize(3) == 4">>;
diff --git a/xformer/Utils/Util.cpp b/xformer/Utils/Util.cpp
index 874c638bf..c8ae9904f 100644
--- a/xformer/Utils/Util.cpp
+++ b/xformer/Utils/Util.cpp
@@ -138,25 +138,13 @@ int mergeAxes(std::vector<int32_t> &begin, std::vector<int32_t> &size,
   return rank;
 }

-// Converts int64_t vector to int32_t vector, returns failure if any value is
-// out of int32_t range.
-LogicalResult convertToI32Array(const SmallVectorImpl<int64_t> &input,
-                                SmallVectorImpl<int32_t> &output) {
-  for (auto val : input) {
-    if (val > std::numeric_limits<int32_t>::max() ||
-        val < std::numeric_limits<int32_t>::min())
-      return failure();
-    output.push_back(static_cast<int32_t>(val));
-  }
-  return success();
-}
-
 // Creates a constant op for a shape vector.
 Value createShapeConstOp(PatternRewriter &rewriter, Location loc,
-                         const SmallVectorImpl<int64_t> &shapeVec) {
+                         const SmallVector<int64_t> &shapeVec) {
   SmallVector<int32_t> shapeVecI32;
-  if (failed(convertToI32Array(shapeVec, shapeVecI32)))
-    return nullptr;
+  for (auto val : shapeVec) {
+    shapeVecI32.push_back(static_cast<int32_t>(val));
+  }
   auto shapeType = RankedTensorType::get(
       {static_cast<int64_t>(shapeVecI32.size())}, rewriter.getI32Type());
   auto shapeAttr = DenseIntElementsAttr::get(shapeType, shapeVecI32);
@@ -166,9 +154,9 @@ Value createShapeConstOp(PatternRewriter &rewriter, Location loc,
 // Helper function for reshape-transpose-reshape pattern.
 LogicalResult
 reshapeTransposeReshape(PatternRewriter &rewriter, Value tensor,
-                        const SmallVectorImpl<int64_t> &reshapeShape,
-                        const SmallVectorImpl<int64_t> &permVec,
-                        const SmallVectorImpl<int64_t> &origShape,
+                        const SmallVector<int64_t> &reshapeShape,
+                        const SmallVector<int64_t> &permVec,
+                        const SmallVector<int64_t> &origShape,
                         Value &result) {
   auto loc = tensor.getLoc();
   auto tensorType = tensor.getType().cast<RankedTensorType>();
@@ -184,8 +172,9 @@ reshapeTransposeReshape(PatternRewriter &rewriter, Value tensor,
   // Convert permVecExclBatch to int32_t vector.
   SmallVector<int32_t> permVecI32;
-  if (failed(convertToI32Array(permVec, permVecI32)))
-    return failure();
+  for (auto val : permVec) {
+    permVecI32.push_back(static_cast<int32_t>(val));
+  }
   // Create perm op.
   auto permType = RankedTensorType::get(
diff --git a/xformer/Utils/Util.h b/xformer/Utils/Util.h
index 97de0b725..23e082292 100644
--- a/xformer/Utils/Util.h
+++ b/xformer/Utils/Util.h
@@ -72,17 +72,14 @@ int mergeAxes(std::vector<int32_t> &begin, std::vector<int32_t> &size,
               std::vector<int32_t> &inShape, std::vector<int32_t> &outShape,
               int rank);

-LogicalResult convertToI32Array(const SmallVectorImpl<int64_t> &input,
-                                SmallVectorImpl<int32_t> &output);
-
 Value createShapeConstOp(PatternRewriter &rewriter, Location loc,
-                         const SmallVectorImpl<int64_t> &shapeVec);
+                         const SmallVector<int64_t> &shapeVec);

 LogicalResult
 reshapeTransposeReshape(PatternRewriter &rewriter, Value tensor,
-                        const SmallVectorImpl<int64_t> &reshapeShape,
-                        const SmallVectorImpl<int64_t> &permVec,
-                        const SmallVectorImpl<int64_t> &origShape,
+                        const SmallVector<int64_t> &reshapeShape,
+                        const SmallVector<int64_t> &permVec,
+                        const SmallVector<int64_t> &origShape,
                         Value &result);

 template
diff --git a/xformer/lib_tflite_micro.BUILD b/xformer/lib_tflite_micro.BUILD
index a87c6536d..3ba5d5739 100644
--- a/xformer/lib_tflite_micro.BUILD
+++ b/xformer/lib_tflite_micro.BUILD
@@ -37,6 +37,7 @@ filegroup(
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_add.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_pad.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_concat.cc",
+        "lib_tflite_micro/src/tflite-xcore-kernels/xcore_transpose.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_3_to_4.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_slice.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_broadcast.cc",
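
Note (not part of the patch): FoldTransposeToReshapePattern in OptimizeTranspose.cpp above rewrites a transpose as a reshape when the permutation only moves axes across runs of size-1 dimensions, so the row-major element order is unchanged. Below is a minimal standalone C++ sketch of that legality check, written outside MLIR; the name transposeIsReshape is illustrative and is not a function introduced by this PR.

#include <algorithm>
#include <cstdint>
#include <vector>

// True when the permuted tensor has the same row-major element order as the
// input, i.e. the transpose can be lowered to a pure reshape. Mirrors the
// loop in FoldTransposeToReshapePattern: for every moved axis, all axes
// between its old and new positions must have size 1, except at most one.
bool transposeIsReshape(const std::vector<int64_t> &shape,
                        const std::vector<int64_t> &perm) {
  for (int64_t i = 0; i < static_cast<int64_t>(perm.size()); ++i) {
    if (perm[i] == i)
      continue;
    const int64_t s = std::min(perm[i], i);
    const int64_t e = std::max(perm[i], i);
    bool foundNonOne = false;
    for (int64_t j = s; j <= e; ++j) {
      if (shape[j] != 1) {
        if (foundNonOne)
          return false; // two real axes would have to swap places
        foundNonOne = true;
      }
    }
  }
  return true;
}

// Examples:
//   transposeIsReshape({1, 4, 1, 3}, {1, 0, 2, 3}) == true  (only the 4 moves)
//   transposeIsReshape({2, 4, 1, 3}, {1, 0, 2, 3}) == false (2 and 4 swap)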
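Note (not part of the patch): ReplaceTransposePattern encodes each transpose for the xcore kernel as two length-4 i32 arrays: t_shape (the output shape, serialized under flexbuffer key "s" in TransposeOp::buildCustomOptions) and offsets (the input's row-major strides permuted the same way, key "o"). The standalone sketch below reproduces just that stride/offset computation and checks it against an explicit coordinate mapping. computeParams and XCTransposeParams are illustrative names, and the rank-4 input here assumes the pass has already dropped size-1 dims, merged consecutive dims, folded in the element size, and padded to rank 4.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

struct XCTransposeParams {
  std::array<int32_t, 4> offsets; // input row-major strides, permuted
  std::array<int32_t, 4> t_shape; // output shape: shape[perm[i]]
};

XCTransposeParams computeParams(const std::array<int32_t, 4> &shape,
                                const std::array<int32_t, 4> &perm) {
  // Row-major stride of dim i = product of all dims after it.
  std::array<int32_t, 4> strides;
  for (int i = 0; i < 4; ++i) {
    int32_t prod = 1;
    for (int j = i + 1; j < 4; ++j)
      prod *= shape[j];
    strides[i] = prod;
  }
  XCTransposeParams p;
  for (int i = 0; i < 4; ++i) {
    p.offsets[i] = strides[perm[i]];
    p.t_shape[i] = shape[perm[i]];
  }
  return p;
}

int main() {
  const std::array<int32_t, 4> shape{2, 3, 4, 5};
  const std::array<int32_t, 4> perm{0, 2, 3, 1}; // NCHW -> NHWC style
  const XCTransposeParams p = computeParams(shape, perm);

  // Fill the input with its own linear index, so a gathered value equals the
  // source offset it was read from.
  std::vector<int32_t> in(shape[0] * shape[1] * shape[2] * shape[3]);
  for (size_t i = 0; i < in.size(); ++i)
    in[i] = static_cast<int32_t>(i);

  // Kernel-side gather: output coordinate (a, b, c, d) reads the input at
  // a*offsets[0] + b*offsets[1] + c*offsets[2] + d*offsets[3].
  for (int32_t a = 0; a < p.t_shape[0]; ++a)
    for (int32_t b = 0; b < p.t_shape[1]; ++b)
      for (int32_t c = 0; c < p.t_shape[2]; ++c)
        for (int32_t d = 0; d < p.t_shape[3]; ++d) {
          // Reference: map output coords back to input coords via
          // in_coord[perm[i]] = out_coord[i], then index row-major.
          int32_t coord[4];
          coord[perm[0]] = a;
          coord[perm[1]] = b;
          coord[perm[2]] = c;
          coord[perm[3]] = d;
          const int32_t expected =
              ((coord[0] * shape[1] + coord[1]) * shape[2] + coord[2]) *
                  shape[3] +
              coord[3];
          const int32_t got = a * p.offsets[0] + b * p.offsets[1] +
                              c * p.offsets[2] + d * p.offsets[3];
          assert(in[got] == expected);
        }
  std::puts("offsets/t_shape gather matches the reference transpose");
  return 0;
}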