
Commit 5508516

[mlir][sparse] retry sparse-only for cyclic iteration graphs
This is a very minor improvement during iteration graph construction. If the first attempt considering the dimension order of all tensors fails, a second attempt is made using the constraints of sparse tensors only. Dense tensors prefer dimension order (locality) but provide random access if needed, enabling the compilation of more sparse kernels.

Reviewed By: penpornk

Differential Revision: https://reviews.llvm.org/D94709
1 parent 39665d9

2 files changed: +111 -7 lines changed

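The commit message describes the strategy in a few words; the following is a minimal standalone C++ sketch of the idea, not the patch itself. The type TensorAccess and the function computeLoopOrder are invented here for illustration (only topSortDFS also exists in Sparsification.cpp, as the diff below shows): every tensor contributes ordering edges between consecutive loop indices of its access pattern, the resulting graph is topologically sorted, and if a cycle is found the construction is retried with the constraints of sparse tensors only, since dense tensors tolerate any loop order.

// Minimal standalone model of the retry (illustration only, not the MLIR code).
#include <cstdio>
#include <vector>

struct TensorAccess {
  std::vector<unsigned> indexOrder; // loop indices in tensor access order
  bool isSparse;                    // tensor has at least one sparse dimension
};

// Depth-first topological sort; returns false when a cycle is detected.
static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
                       std::vector<unsigned> &topSort,
                       const std::vector<std::vector<bool>> &adjM) {
  if (visit[i] == 1)
    return false; // still on the DFS stack: back-edge, i.e. a cycle
  if (visit[i] == 2)
    return true; // already finished
  visit[i] = 1;
  for (unsigned j = 0, n = adjM.size(); j < n; j++)
    if (adjM[i][j] && !topSortDFS(j, visit, topSort, adjM))
      return false;
  visit[i] = 2;
  topSort.push_back(i); // post-order: reverse of a topological order
  return true;
}

// Builds the n x n iteration graph and topologically sorts it; when
// sparseOnly is set, the constraints of dense tensors are skipped.
static bool computeLoopOrder(const std::vector<TensorAccess> &tensors,
                             unsigned n, std::vector<unsigned> &topSort,
                             bool sparseOnly) {
  std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
  for (const TensorAccess &t : tensors) {
    if (sparseOnly && !t.isSparse)
      continue; // dense tensors allow random access, so drop their preference
    for (unsigned d = 1, e = t.indexOrder.size(); d < e; d++)
      adjM[t.indexOrder[d - 1]][t.indexOrder[d]] = true;
  }
  topSort.clear();
  topSort.reserve(n);
  std::vector<unsigned> visit(n, 0);
  for (unsigned i = 0; i < n; i++)
    if (visit[i] == 0 && !topSortDFS(i, visit, topSort, adjM))
      return false;
  return true;
}

int main() {
  // A(i,j) is dense, B(j,i) has a sparse dimension: i->j and j->i together
  // form a cycle, but the sparse-only graph (just j->i) is acyclic.
  std::vector<TensorAccess> tensors = {{{0, 1}, /*isSparse=*/false},
                                       {{1, 0}, /*isSparse=*/true}};
  std::vector<unsigned> topSort;
  if (!computeLoopOrder(tensors, 2, topSort, /*sparseOnly=*/false) &&
      !computeLoopOrder(tensors, 2, topSort, /*sparseOnly=*/true))
    return 1; // still cyclic even with sparse constraints only: give up
  for (auto it = topSort.rbegin(); it != topSort.rend(); ++it)
    std::printf("loop index %u\n", *it); // prints 1 then 0: j outer, i inner
  return 0;
}

In this sketch the first attempt fails on the cycle i->j->i and the retry yields the loop order j, i (indices 1 and 0), which is the same situation the eight-dimensional test added in this commit creates on a larger scale.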
mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

Lines changed: 17 additions & 7 deletions
@@ -274,6 +274,11 @@ class Merger {
     return false;
   }
 
+  // Returns true if tensor has any sparse dimension.
+  bool isSparseTensor(unsigned t) const {
+    return llvm::any_of(dims[t], [](Dim d) { return d == Dim::kSparse; });
+  }
+
   // Setter
   void setDim(unsigned t, unsigned i, Dim d) { dims[t][i] = d; }
 
@@ -382,17 +387,22 @@ static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
 /// for sparse storage formats since these only support access along fixed
 /// dimensions. Even for dense storage formats, however, the natural index
 /// order yields innermost unit-stride access with better spatial locality.
-static bool computeIterationGraph(linalg::GenericOp op,
-                                  std::vector<unsigned> &topSort) {
+static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
+                                  std::vector<unsigned> &topSort,
+                                  bool sparseOnly) {
   // Set up an n x n from/to adjacency matrix of the iteration graph
   // for the implicit loop indices i_0 .. i_n-1.
   unsigned n = op.getNumLoops();
   std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
 
   // Iterate over the indexing maps of every tensor in the tensor expression.
-  for (auto imap : llvm::enumerate(op.indexing_maps())) {
-    auto map = imap.value().template cast<AffineMapAttr>().getValue();
+  unsigned numTensors = op.getNumShapedOperands();
+  for (unsigned t = 0; t < numTensors; t++) {
+    auto map = op.getIndexingMap(t);
     assert(map.getNumDims() == n);
+    // Skip dense tensor constraints when sparse only is requested.
+    if (sparseOnly && !merger.isSparseTensor(t))
+      continue;
     // At the moment, we take the index variables in the tensor access
     // expression in the order in which they appear (conceptually a
     // "row-major" layout of every tensor). So, a tensor access A_ijk
@@ -407,6 +417,7 @@ static bool computeIterationGraph(linalg::GenericOp op,
 
   // Topologically sort the iteration graph to determine loop order.
   // Report failure for a cyclic iteration graph.
+  topSort.clear();
   topSort.reserve(n);
   std::vector<unsigned> visit(n, 0);
   for (unsigned i = 0; i < n; i++)
@@ -1207,10 +1218,9 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
     // tensors are visited in natural index order. Fails on cycles.
     // This assumes that higher-level passes have already put the
     // tensors in each tensor expression in a feasible order.
-    // TODO: try again without *dense* constraints on failure or
-    //       even try to insert sparse reorderings to resolve cycles
     std::vector<unsigned> topSort;
-    if (!computeIterationGraph(op, topSort))
+    if (!computeIterationGraph(merger, op, topSort, /*sparseOnly=*/false) &&
+        !computeIterationGraph(merger, op, topSort, /*sparseOnly=*/true))
       return failure();
 
     // Finds the terminating yield statement and builds the tensor
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
// RUN: mlir-opt %s -test-sparsification | FileCheck %s

// Example with cyclic iteration graph with sparse and dense constraints,
// but an acyclic iteration graph using sparse constraints only.
#trait_mul = {
  indexing_maps = [
    affine_map<(i,j,k,l,m,n,o,p) -> (i,j,k,l,m,n,o,p)>,  // A
    affine_map<(i,j,k,l,m,n,o,p) -> (p,o,n,m,l,k,j,i)>,  // B
    affine_map<(i,j,k,l,m,n,o,p) -> (i,j,k,l,m,n,o,p)>   // X
  ],
  sparse = [
    [ "D", "D", "D", "D", "D", "D", "D", "D" ],  // a
    [ "D", "D", "D", "S", "S", "D", "D", "D" ],  // b
    [ "D", "D", "D", "D", "D", "D", "D", "D" ]   // x
  ],
  iterator_types = ["parallel", "parallel", "parallel", "parallel",
                    "parallel", "parallel", "parallel", "parallel"],
  doc = "X(i,j,k,l,m,n,o,p) = A(i,j,k,l,m,n,o,p) * B(p,o,n,m,l,k,j,i)"
}

// CHECK-LABEL: func @mul(
// CHECK-SAME:  %[[VAL_0:.*]]: tensor<100x200x300x400x500x600x700x800xf32>,
// CHECK-SAME:  %[[VAL_1:.*]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
// CHECK:   %[[VAL_2:.*]] = constant 999 : index
// CHECK:   %[[VAL_3:.*]] = constant 100 : index
// CHECK:   %[[VAL_4:.*]] = constant 200 : index
// CHECK:   %[[VAL_5:.*]] = constant 300 : index
// CHECK:   %[[VAL_6:.*]] = constant 600 : index
// CHECK:   %[[VAL_7:.*]] = constant 700 : index
// CHECK:   %[[VAL_8:.*]] = constant 800 : index
// CHECK:   %[[VAL_9:.*]] = constant 0 : index
// CHECK:   %[[VAL_10:.*]] = constant 1 : index
// CHECK:   %[[VAL_11:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK:   %[[VAL_12:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK:   %[[VAL_13:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK:   %[[VAL_14:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK:   %[[VAL_15:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
// CHECK:   %[[VAL_16:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
// CHECK:   %[[VAL_17:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
// CHECK:   scf.for %[[VAL_18:.*]] = %[[VAL_9]] to %[[VAL_8]] step %[[VAL_10]] {
// CHECK:     scf.for %[[VAL_19:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_10]] {
// CHECK:       %[[VAL_20:.*]] = muli %[[VAL_18]], %[[VAL_7]] : index
// CHECK:       %[[VAL_21:.*]] = addi %[[VAL_20]], %[[VAL_19]] : index
// CHECK:       scf.for %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_6]] step %[[VAL_10]] {
// CHECK:         %[[VAL_23:.*]] = muli %[[VAL_21]], %[[VAL_6]] : index
// CHECK:         %[[VAL_24:.*]] = addi %[[VAL_23]], %[[VAL_22]] : index
// CHECK:         %[[VAL_25:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
// CHECK:         %[[VAL_26:.*]] = addi %[[VAL_24]], %[[VAL_10]] : index
// CHECK:         %[[VAL_27:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<?xindex>
// CHECK:         scf.for %[[VAL_28:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_10]] {
// CHECK:           %[[VAL_29:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK:           %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_28]]] : memref<?xindex>
// CHECK:           %[[VAL_31:.*]] = addi %[[VAL_28]], %[[VAL_10]] : index
// CHECK:           %[[VAL_32:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref<?xindex>
// CHECK:           scf.for %[[VAL_33:.*]] = %[[VAL_30]] to %[[VAL_32]] step %[[VAL_10]] {
// CHECK:             %[[VAL_34:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_33]]] : memref<?xindex>
// CHECK:             scf.for %[[VAL_35:.*]] = %[[VAL_9]] to %[[VAL_5]] step %[[VAL_10]] {
// CHECK:               %[[VAL_36:.*]] = muli %[[VAL_33]], %[[VAL_5]] : index
// CHECK:               %[[VAL_37:.*]] = addi %[[VAL_36]], %[[VAL_35]] : index
// CHECK:               scf.for %[[VAL_38:.*]] = %[[VAL_9]] to %[[VAL_4]] step %[[VAL_10]] {
// CHECK:                 %[[VAL_39:.*]] = muli %[[VAL_37]], %[[VAL_4]] : index
// CHECK:                 %[[VAL_40:.*]] = addi %[[VAL_39]], %[[VAL_38]] : index
// CHECK:                 scf.for %[[VAL_41:.*]] = %[[VAL_9]] to %[[VAL_3]] step %[[VAL_10]] {
// CHECK:                   %[[VAL_42:.*]] = muli %[[VAL_40]], %[[VAL_3]] : index
// CHECK:                   %[[VAL_43:.*]] = addi %[[VAL_42]], %[[VAL_41]] : index
// CHECK:                   %[[VAL_44:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK:                   %[[VAL_45:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_43]]] : memref<?xf32>
// CHECK:                   %[[VAL_46:.*]] = mulf %[[VAL_44]], %[[VAL_45]] : f32
// CHECK:                   store %[[VAL_46]], %[[VAL_17]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK:                 }
// CHECK:               }
// CHECK:             }
// CHECK:           }
// CHECK:         }
// CHECK:       }
// CHECK:     }
// CHECK:   }
// CHECK:   %[[VAL_47:.*]] = tensor_load %[[VAL_17]] : memref<100x200x300x400x500x600x700x800xf32>
// CHECK:   return %[[VAL_47]] : tensor<100x200x300x400x500x600x700x800xf32>
// CHECK: }
func @mul(%arga: tensor<100x200x300x400x500x600x700x800xf32>,
          %argb: tensor<100x200x300x400x500x600x700x800xf32>)
              -> tensor<100x200x300x400x500x600x700x800xf32> {
  %0 = linalg.generic #trait_mul
    ins(%arga, %argb: tensor<100x200x300x400x500x600x700x800xf32>,
                      tensor<100x200x300x400x500x600x700x800xf32>)
    outs(%arga: tensor<100x200x300x400x500x600x700x800xf32>) {
    ^bb(%a: f32, %b: f32, %s : f32):
      %0 = mulf %a, %b : f32
      linalg.yield %0 : f32
  } -> tensor<100x200x300x400x500x600x700x800xf32>
  return %0 : tensor<100x200x300x400x500x600x700x800xf32>
}
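In this test only b carries sparse dimensions (the "S" entries, corresponding to the dimensions of size 500 and 400), so the first attempt combines A's order i through p with B's reversed order p through i and fails on the resulting cycle, while the sparse-only retry keeps just B's constraints. The CHECK lines reflect exactly that order: the generated loop nest runs over the 800, 700 and 600 dimensions, then the two sparse levels of b, then the 300, 200 and 100 dimensions.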
