Commit ea47887
Fix for TOSA-to-linalg lowering of tosa.transpose op (#72698)
The TOSA-to-linalg conversion of `tosa.transpose` contains a bug in the computation of the result tensor shape when using dynamic dimensions. This bug may have widespread implications in projects such as TensorFlow, where `tosa.transpose` is frequently generated.

Consider the following TOSA code using only static dimensions. The code transposes a tensor of shape 10x11x12 into 12x10x11 by mapping input dimensions [2, 0, 1] to output dimensions [0, 1, 2].

```
func.func @test_tosa_transpose(%input: tensor<10x11x12xf32>) -> tensor<12x10x11xf32> {
  %perms = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
  %transposed = "tosa.transpose"(%input, %perms) : (tensor<10x11x12xf32>, tensor<3xi32>) -> tensor<12x10x11xf32>
  return %transposed : tensor<12x10x11xf32>
}
```

The code is correctly lowered to:

```
#map = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module {
  func.func @test_tosa_transpose(%arg0: tensor<10x11x12xf32>) -> tensor<12x10x11xf32> {
    %empty = tensor.empty() : tensor<12x10x11xf32>
    %transposed = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<10x11x12xf32>) outs(%empty : tensor<12x10x11xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<12x10x11xf32>
    return %transposed : tensor<12x10x11xf32>
  }
}
```

Now let's make all dimensions dynamic in the TOSA code:

```
func.func @test_tosa_transpose(%input: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
  %perms = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
  %transposed = "tosa.transpose"(%input, %perms) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
  return %transposed : tensor<?x?x?xf32>
}
```

The `tensor.empty()` op now needs additional information about the size of the output tensor, which is computed dynamically with a set of `tensor.dim` ops. The comments below assume an input tensor of size 10x11x12, as before. The code is lowered as:

```
#map = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module {
  func.func @test_tosa_transpose(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %arg0_dim0 = tensor.dim %arg0, %c0 : tensor<?x?x?xf32>  // Evaluates to 10
    %arg0_dim1 = tensor.dim %arg0, %c1 : tensor<?x?x?xf32>  // Evaluates to 11
    %arg0_dim2 = tensor.dim %arg0, %c2 : tensor<?x?x?xf32>  // Evaluates to 12
    %empty = tensor.empty(%arg0_dim1, %arg0_dim2, %arg0_dim0) : tensor<?x?x?xf32>  // Output of type tensor<11x12x10> WRONG!
    %transposed = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<?x?x?xf32>) outs(%empty : tensor<?x?x?xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<?x?x?xf32>
    return %transposed : tensor<?x?x?xf32>
  }
}
```

The output tensor shape is dynamically computed as 11x12x10 instead of 12x10x11. Since the total size of the output tensor is still the same, the code does not segfault after bufferization. However, index computations are invalid and lead to silent wrong answers (SWAs).
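To see the faulty index arithmetic in isolation, here is a minimal standalone C++ sketch (illustrative only, not part of the commit) that contrasts the output shape computed by the old lowering with the correct one for the example above:

```cpp
#include <array>
#include <cstdio>

int main() {
  // Input shape 10x11x12 and permutation [2, 0, 1], as in the example.
  std::array<int, 3> inShape = {10, 11, 12};
  std::array<int, 3> perms = {2, 0, 1};
  std::array<int, 3> buggy{}, fixed{};
  for (int i = 0; i < 3; ++i) {
    buggy[perms[i]] = inShape[i]; // old lowering: applies the inverse permutation
    fixed[i] = inShape[perms[i]]; // fixed lowering: outShape[i] = inShape[perms[i]]
  }
  std::printf("buggy: %dx%dx%d\n", buggy[0], buggy[1], buggy[2]); // 11x12x10
  std::printf("fixed: %dx%dx%d\n", fixed[0], fixed[1], fixed[2]); // 12x10x11
}
```

Because the old mapping is the inverse permutation of the correct one, the element count is preserved, which is exactly why the bug bufferizes cleanly and surfaces only as wrong index computations.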
1 parent b810b66 commit ea47887

File tree

2 files changed: +31 -9 lines changed


mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp

Lines changed: 2 additions & 3 deletions
```diff
@@ -1072,12 +1072,11 @@ class TransposeConverter : public OpRewritePattern<tosa::TransposeOp> {

     SmallVector<AffineExpr, 2> inputExprs;
     inputExprs.resize(resultTy.getRank());
-    auto operandTy = cast<ShapedType>(input.getType());
     for (const auto &permutation : llvm::enumerate(perms.getValues<APInt>())) {
       auto index = permutation.index();
       auto value = permutation.value().getZExtValue();
-      if (!operandTy.hasRank() || operandTy.isDynamicDim(index)) {
-        dynDims[value] = rewriter.create<tensor::DimOp>(loc, input, index);
+      if (!resultTy.hasRank() || resultTy.isDynamicDim(index)) {
+        dynDims[index] = rewriter.create<tensor::DimOp>(loc, input, value);
       }
       inputExprs[value] = rewriter.getAffineDimExpr(index);
     }
```
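In this pattern, `index` enumerates output dimensions and `value = perms[index]` is the input dimension that feeds output dimension `index`. The fix therefore keys the dynamic-size slot `dynDims` by `index` and queries `tensor.dim` on input dimension `value`; it likewise guards on `resultTy` rather than the operand type, since it is the result's dynamic dimensions that are being materialized for `tensor.empty`.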

mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir

Lines changed: 29 additions & 6 deletions
```diff
@@ -877,14 +877,14 @@ func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)>
 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>

-// CHECK-LABEL: @test_transpose_dyn
+// CHECK-LABEL: @test_transpose_dyn_multiple_2d
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
-func.func @test_transpose_dyn_multiple(%arg0: tensor<?x?xf32>) -> () {
+func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
   %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
-  // CHECK: %[[C0:.+]] = arith.constant 0
-  // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
-  // CHECK: %[[C1:.+]] = arith.constant 1
-  // CHECK: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+  // CHECK-DAG: %[[C0:.+]] = arith.constant 0
+  // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+  // CHECK-DAG: %[[C1:.+]] = arith.constant 1
+  // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]])
   // CHECK: %[[GENERIC:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0]] : tensor<?x?xf32>) outs([[OUT:%.+]] : tensor<?x?xf32>)
   // CHECK: ^bb0([[ARG1:%.+]]: f32, [[ARG2:%.+]]: f32)
@@ -896,6 +896,29 @@ func.func @test_transpose_dyn_multiple(%arg0: tensor<?x?xf32>) -> () {

 // -----

+// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK-LABEL: @test_transpose_dyn_multiple_3d
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x?xf32>)
+func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
+  %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+  // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor<?x?x?xf32>
+  // CHECK: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[INIT]] : tensor<?x?x?xf32>) {
+  // CHECK: ^bb0(%[[IN0:.*]]: f32, %[[OUT0:.*]]: f32):
+  // CHECK: linalg.yield %[[IN0]] : f32
+  // CHECK: } -> tensor<?x?x?xf32>
+  %1 = "tosa.transpose"(%arg0, %0) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
+  return
+}
+
+// -----

 // CHECK-LABEL: @reduce_float
 // CHECK-SAME: [[ARG0:%.+]]: tensor<5x4xf32>
```
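The switch from `CHECK` to `CHECK-DAG` for the `arith.constant` and `tensor.dim` lines lets FileCheck match those ops in any relative order, keeping the tests stable if the lowering or later canonicalization reorders them.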
