Add GQA and KV Cache #1696

Open · wants to merge 8 commits into base: develop
11 changes: 7 additions & 4 deletions mlir/include/mlir/Dialect/Rock/IR/RockOps.td
@@ -206,12 +206,13 @@
}

def Rock_AttentionOp :
Rock_Op<"attention", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot]>,
Rock_Op<"attention", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot, AttrSizedOperandSegments]>,
Arguments<(ins
TensorOrMemRefOf<[F32, F16, I8]>:$queries,
TensorOrMemRefOf<[F32, F16, I8]>:$keys,
TensorOrMemRefOf<[F32, F16]>:$values,
Variadic<TensorOrMemRefOf<[F32, F16, I8]>>:$preSoftmaxElemWiseInputs,
+Optional<TensorOrMemRefOf<[I32]>>:$currentSeqLen,
TensorOrMemRefOf<[F32, F16]>:$out,
UnitAttr:$qTransposed,
UnitAttr:$kTransposed,
@@ -244,6 +245,7 @@ def Rock_AttentionOp :
let assemblyFormat = [{
`{` `\n`
` ` `qk` `=` (`tr` $qTransposed^)? $queries `*` (`tr` $kTransposed^)? $keys `:` type($queries) `,` type($keys) `\n`
+(`currentSeqLen` `=` `(` $currentSeqLen^ `:` type($currentSeqLen) `)` `\n`)?
(`qk` `=` `elementwise` (`otherIns` `(` $preSoftmaxElemWiseInputs^ `:` type($preSoftmaxElemWiseInputs) `)`)? $preSoftmaxBody^ `\n`)?
(`tr` $oTransposed^)? $out `=` `softmax` `(` `qk` `)` `*` (`tr` $vTransposed^)? $values `:` type($values) `->` type($out) `\n`
`}` attr-dict (`->` type($result)^)?
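For reference, the custom syntax above would print roughly as follows when the optional operand is present. This is a hypothetical example: the SSA names and shapes are invented for illustration and the attribute dictionary is omitted; only the line order comes from the format string in this diff.

// Hypothetical printed form of rock.attention with a KV-cache length.
// %q : 1x384x32, %k : 1x32x384, so qk is 384x384; %len holds one
// valid-sequence length per batch element.
rock.attention {
  qk = %q * %k : memref<1x384x32xf32>, memref<1x32x384xf32>
  currentSeqLen = (%len : memref<1xi32>)
  %out = softmax(qk) * %v : memref<1x384x64xf32> -> memref<1x384x64xf32>
}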
@@ -431,11 +433,12 @@ def Rock_GridwiseGemmAccelOp :

// gridwise_attention_accel
def Rock_GridwiseAttentionAccelOp :
Rock_Op<"gridwise_attention_accel", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot]>,
Rock_Op<"gridwise_attention_accel", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot, AttrSizedOperandSegments]>,
Arguments<(ins MemRefRankOf<[F32, F16, I8], [3]>:$queries,
MemRefRankOf<[F32, F16, I8], [3]>:$keys,
MemRefRankOf<[F32, F16], [3]>:$values,
Variadic<TensorOrMemRefOf<[F32, F16, I8]>>:$preSoftmaxElemWiseInputs,
+Optional<MemRefRankOf<[I32], [1]>>:$currentSeqLen,
MemRefRankOf<[F32, F16], [3]>:$out,
StrAttr:$arch,
Rock_GemmFeaturesAttr:$features,
@@ -677,7 +680,7 @@ def Rock_TransformingForOp :
Results<(outs Variadic<AnyType>:$results)> {
let summary = "for loop with coordinate transforms";
let description = [{
-Loops over several a rectangular regeon of dimensions `bounds` in several
+Loops over several a rectangular region of dimensions `bounds` in several
iteration domains, which are coordinate spaces that are the upper coordinates
for a sequence of coordinate transformations.

@@ -779,7 +782,7 @@ def Rock_TransformingForOp :
return *(getLowerStarts().getValues<uint32_t>().begin() + n);
}

-// Retreive the block arguments corresponding to the lower coordinates
+// Retrieve the block arguments corresponding to the lower coordinates
// for a given iteration domain.
Block::BlockArgListType getLowerCoords(uint32_t domain) {
uint32_t start = getLowerStart(domain);
3 changes: 2 additions & 1 deletion mlir/include/mlir/Dialect/Rock/utility/builderUtils.h
@@ -11,7 +11,8 @@ namespace mlir {
namespace rock {
/// Utility op to emit constant float op
Value createConstantFloatOp(OpBuilder &b, Location loc, Type type,
-Type elemType, float value);
+Type elemType, float value,
+APFloat::opStatus expectedStatus = APFloat::opOK);

/// Utility op to emit constant int op
Value createConstantIntOp(OpBuilder &b, Location loc, Type type, Type elemType,
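A minimal usage sketch of the widened helper (the call sites below are assumptions for illustration; only the declaration comes from this diff). The defaulted APFloat::opOK keeps the previous exact-conversion expectation for existing callers, while a caller materializing a value known to round in the target element type can state the status it expects:

// Sketch; assumes an OpBuilder `b`, Location `loc`, result type `resType`,
// and element type `elemType` (e.g. f16) are already in scope.
// 0.0f converts exactly to any float element type: the opOK default applies.
Value zero = rock::createConstantFloatOp(b, loc, resType, elemType, 0.0f);
// 0.1f is not exactly representable in f16; the conversion reports
// APFloat::opInexact, which the caller now declares up front.
Value tenth = rock::createConstantFloatOp(b, loc, resType, elemType, 0.1f,
                                          APFloat::opInexact);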
4 changes: 3 additions & 1 deletion mlir/lib/Conversion/TosaToRock/TosaToRock.cpp
@@ -1089,9 +1089,11 @@ struct AttentionRewritePattern : public OpRewritePattern<tosa::MatMulOp> {
tosa::MatMulOp firstMatMulOp = maybeFirstMatMul.value();
IntegerAttr numCUAttr =
numCu.has_value() ? rewriter.getI32IntegerAttr(numCu.value()) : nullptr;

+// TODO: extract currentSeqLen from tosa
rock::AttentionOp attnOp = rewriter.create<rock::AttentionOp>(
loc, outputType, firstMatMulOp.getA(), firstMatMulOp.getB(), op.getB(),
-elemwiseOtherArgs, output,
+elemwiseOtherArgs, nullptr, output,
// TODO(implement transpose fusion support here)
/*qTransposed=*/nullptr,
/*kTransposed=*/nullptr,
35 changes: 35 additions & 0 deletions mlir/lib/Dialect/Rock/IR/RockDialect.cpp
@@ -2098,6 +2098,41 @@
if (keyN != valueK) {
return emitError("reduction dimensions of second gemm do not match");
}

+// check output type
+ShapedType oType = getOut().getType();
+int64_t oBatchDim = oType.getShape().size() == 3 ? oType.getShape()[0] : 1;
+
+ArrayRef<int64_t> oLastDims = oType.getShape().slice(oType.getRank() - 2);
+auto [outputSeqLen, outputHeadDim] =
+    getOTransposed() ? std::tuple{oLastDims[1], oLastDims[0]}
+                     : std::tuple{oLastDims[0], oLastDims[1]};
+
+if (qType.getShape().size() != oType.getShape().size()) {
+  return emitError("Number of dimensions do not match (Q and Output)");
+}
+if (qBatchDim != oBatchDim) {
+  return emitError("Batch dimensions do not match (Q and Output)");
+}
+if (queryM != outputSeqLen) {
+  return emitError("Sequence length does not match (Q and Output)");
+}
+if (valueN != outputHeadDim) {
+  return emitError("Head dimensions do not match (V and Output)");
+}
+
+// check currentSeqLen (KV Cache)
+auto currentSeqLen = getCurrentSeqLen();
+if (currentSeqLen) {
+  ShapedType seqLenType = currentSeqLen.getType();
+  if (seqLenType.getShape().size() != 1) {
+    return emitError("Number of dimensions is not one (currentSeqLen)");
+  }
+  if (seqLenType.getShape()[0] != oBatchDim) {
+    return emitError(
+        "Batch dimensions do not match (currentSeqLen and Output)");
+  }
+}
return success();
}

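To make the new checks concrete, a few hypothetical shape combinations (invented for illustration) and how this verifier treats them:

// out:           memref<8x384x64xf32> -> oBatchDim == 8
// currentSeqLen: memref<8xi32>        -> accepted: rank 1, size matches batch
// currentSeqLen: memref<8x1xi32>      -> rejected: "Number of dimensions is not one"
// currentSeqLen: memref<4xi32>        -> rejected: "Batch dimensions do not match"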
10 changes: 4 additions & 6 deletions mlir/lib/Dialect/Rock/Transforms/AffixTuningParameters.cpp
@@ -246,8 +246,7 @@
if (!isAccel) {
op.emitError("Currently, attention op is only supported on GPUs "
"with matrix accelerator extentions");
-signalPassFailure();
-return;
+return signalPassFailure();
}
Attribute params0 = op.getParams0().value_or(nullptr);
// set a default one if params is not provided
@@ -262,6 +261,7 @@
auto attnPerfConfig = AttnPerfConfigAttr::get(perfConfigStrAttr);
if (!attnPerfConfig) {
op.emitError("perf config string has an incorrect format.");
+return signalPassFailure();
}
GemmFeatures features = op.getFeatures();
RockAccelTuningParamAttrInterface accelParams0;
@@ -283,8 +283,7 @@
if (attnPerfConfig.getMPerBlockG0() > attnPerfConfig.getMPerBlockG1()) {
op.emitError(
"The MPerBlockG0 should be larger or equal to getMPerBlockG1.");
-signalPassFailure();
-return;
+return signalPassFailure();
}
RockAccelTuningParamAttrInterface accelParams1 =
deriveGemm1TuningParams(builder, op, attnPerfConfig);
@@ -308,8 +307,7 @@
/*enableDPerWaveFiltering=*/false);
if (isValidBlockwiseGemm0.failed() || isValidBlockwiseGemm1.failed()) {
op.emitError("The provided perf config is not valid");
-signalPassFailure();
-return;
+return signalPassFailure();
}

IntegerAttr blockSizeAttr = builder.getI32IntegerAttr(blockSize);
5 changes: 3 additions & 2 deletions mlir/lib/Dialect/Rock/Transforms/GemmToGridwise.cpp
@@ -538,8 +538,9 @@ AttentionRewritePattern::matchAndRewrite(AttentionOp op,
prePadG0NAttr = rw.getIndexAttr(gemm0Size.n);
}
auto newOp = rw.create<GridwiseAttentionAccelOp>(
-loc, queries, keys, values, adaptor.getPreSoftmaxElemWiseInputs(), out,
-op.getArchAttr(), op.getFeaturesAttr(), blockSizeAttr, gridSizeAttr,
+loc, queries, keys, values, adaptor.getPreSoftmaxElemWiseInputs(),
+op.getCurrentSeqLen(), out, op.getArchAttr(), op.getFeaturesAttr(),
+blockSizeAttr, gridSizeAttr,
/*disableQBypassLDS=*/nullptr, prePadG0MAttr, prePadG0NAttr, params0,
params1, op.getFirstGemmIdxAttr());
bool linalgOpFound = false;