
Commit 849529b

Author: Peiming Liu
[mlir][sparse] fix performance bug in matmul with a sparse rhs due to suboptimal iteration graphs.
While dense tensors support random access, it is critical to visit them in row-major order for better cache locality. However, we previously considered dense inputs and outputs together when computing constraints for building the iteration graph, which could lead to less efficient iteration graphs. This patch adds a new `SortMask::kIncludeDenseInput` so that dense inputs and outputs are treated separately when building the iteration graph, increasing the chance of constructing a better one. An even more fine-grained approach would be to treat each input separately.

Note: related to #51651

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D144932
1 parent 35d17c1 commit 849529b
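
To make the cache-locality argument concrete, the sketch below (standalone C++, not part of this patch; the CSR struct and all names are purely illustrative) shows the kernel shape that a dense-input-friendly iteration graph yields for X(i,j) += A(i,k) * B(k,j) with a sparse right-hand side B: the loop order i, k, j walks the dense input A and the dense output X row by row while scanning each row of B exactly once.

#include <cstdint>
#include <vector>

// Illustrative CSR view of the sparse right-hand side B (K x N).
struct CSR {
  std::vector<int64_t> ptr; // row pointers, size K + 1
  std::vector<int64_t> idx; // column indices of stored entries
  std::vector<double> val;  // stored values
};

// X(i,j) += A(i,k) * B(k,j) with dense, row-major A (M x K) and X (M x N).
// The i < k < j loop order keeps every dense access row-major.
void matmulSparseRhs(const double *A, const CSR &B, double *X,
                     int64_t M, int64_t K, int64_t N) {
  for (int64_t i = 0; i < M; ++i)        // rows of A and X
    for (int64_t k = 0; k < K; ++k) {    // columns of A == rows of B
      double a = A[i * K + k];           // unit-stride read of A
      for (int64_t p = B.ptr[k]; p < B.ptr[k + 1]; ++p)
        X[i * N + B.idx[p]] += a * B.val[p]; // forward walk within row i of X
    }
}

When the dense-input and dense-output constraints are only available lumped together, the sorter may have to drop both and settle for an order that strides through one of the dense operands; exposing kIncludeDenseInput on its own makes orders like the one above reachable more often.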

File tree

4 files changed: +219 −144 lines

mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp

Lines changed: 55 additions & 26 deletions
@@ -47,12 +47,29 @@ namespace {
 
 /// Iteration graph sorting.
 enum SortMask {
-  kSparseOnly = 0x0,
-  kIncludeDense = 0x1,
-  kIncludeUndef = 0x2,
-  kIncludeAll = 0x3
+  // The individual mask bits.
+  kIncludeDenseOutput = 0x1, // b001
+  kIncludeDenseInput = 0x2,  // b010
+  kIncludeUndef = 0x4,       // b100
+  // The subsets of mask bits.
+  kIncludeAll = 0x7,   // b111
+  kIncludeDense = 0x3, // b011
+  kSparseOnly = 0x0,   // b000
 };
 
+/// SortMask tests on individual bits.
+inline static bool includeDenseInput(unsigned mask) {
+  return mask & SortMask::kIncludeDenseInput;
+}
+
+inline static bool includeDenseOutput(unsigned mask) {
+  return mask & SortMask::kIncludeDenseOutput;
+}
+
+inline static bool includeUndef(unsigned mask) {
+  return mask & SortMask::kIncludeUndef;
+}
+
 /// A helper class that visits an affine expression and tries to find an
 /// AffineDimExpr to which the corresponding iterator from a GenericOp matches
 /// the desired iterator type.
@@ -453,9 +470,35 @@ static bool computeIterationGraph(CodegenEnv &env, unsigned mask,
     const auto map = env.op().getMatchingIndexingMap(&t);
     const auto enc = getSparseTensorEncoding(t.get().getType());
     assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(env.op()) == n);
-    // Skip dense tensor constraints when not requested.
-    if (!(mask & SortMask::kIncludeDense) && !enc)
+
+    bool isDenseInput = !enc && env.op().isDpsInput(&t);
+    bool isDenseOutput = !enc && !isDenseInput;
+
+    // Skips dense inputs/outputs when not requested.
+    if ((isDenseInput && !includeDenseInput(mask)) ||
+        (isDenseOutput && !includeDenseOutput(mask)))
       continue;
+
+    // Push unrelated loops into sparse iteration space, so these
+    // will be skipped more often.
+    // TODO: Do we really need this?
+    if (includeUndef(mask)) {
+      unsigned tensor = t.getOperandNumber();
+      for (unsigned i = 0; i < n; i++) {
+        if (isCompressedDLT(env.dlt(tensor, i)) ||
+            isSingletonDLT(env.dlt(tensor, i))) {
+          for (unsigned j = 0; j < n; j++)
+            if (isUndefDLT(env.dlt(tensor, j))) {
+              adjM[i][j] = true;
+              inDegree[j]++;
+            }
+        } else {
+          assert(isDenseDLT(env.dlt(tensor, i)) ||
+                 isUndefDLT(env.dlt(tensor, i)));
+        }
+      }
+    }
+
     // Each tensor expression and optional dimension ordering (row-major
     // by default) puts an ordering constraint on the loop indices. For
     // example, the tensor expresion A_ijk forces the ordering i < j < k
@@ -508,24 +551,6 @@ static bool computeIterationGraph(CodegenEnv &env, unsigned mask,
         addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
       }
     }
-    // Push unrelated loops into sparse iteration space, so these
-    // will be skipped more often.
-    if (mask & SortMask::kIncludeUndef) {
-      unsigned tensor = t.getOperandNumber();
-      for (unsigned i = 0; i < n; i++) {
-        if (isCompressedDLT(env.dlt(tensor, i)) ||
-            isSingletonDLT(env.dlt(tensor, i))) {
-          for (unsigned j = 0; j < n; j++)
-            if (isUndefDLT(env.dlt(tensor, j))) {
-              adjM[i][j] = true;
-              inDegree[j]++;
-            }
-        } else {
-          assert(isDenseDLT(env.dlt(tensor, i)) ||
-                 isUndefDLT(env.dlt(tensor, i)));
-        }
-      }
-    }
   }
   // Topologically sort the iteration graph to determine loop order.
   // Report failure for a cyclic iteration graph.
@@ -1532,8 +1557,12 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
 
     // An const list of all masks that we used for interation graph
    // computation. Must be ordered from more strict to less strict.
-    const auto allMask = {SortMask::kIncludeAll, SortMask::kIncludeUndef,
-                          SortMask::kIncludeDense, SortMask::kSparseOnly};
+    // Ideally (though might not be guaranteed), the eariler a constraint mask
+    // can be satisfied, the faster the generated kernel will be.
+    const auto allMask = {
+        SortMask::kIncludeAll,        SortMask::kIncludeDense,
+        SortMask::kIncludeDenseInput, SortMask::kIncludeDenseOutput,
+        SortMask::kIncludeUndef,      SortMask::kSparseOnly};
     for (auto mask : allMask) {
       if (computeIterationGraph(env, mask)) {
        hasCycle = false;
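
As a side note, the new mask values are plain bit flags, so the "subset" masks are just unions of the individual bits. The standalone snippet below (not part of the patch) only restates the bit layout and the relaxation order that GenericOpSparsifier now walks through, as a quick reference.

#include <cassert>

// Mask constants copied from the patch above; the checks restate
// relationships that already hold by construction.
enum SortMask : unsigned {
  kIncludeDenseOutput = 0x1, // b001
  kIncludeDenseInput = 0x2,  // b010
  kIncludeUndef = 0x4,       // b100
  kIncludeAll = 0x7,         // b111
  kIncludeDense = 0x3,       // b011
  kSparseOnly = 0x0          // b000
};

int main() {
  // kIncludeDense is the union of the two dense bits, so a mask can now ask
  // for dense-input constraints without also taking dense-output ones.
  static_assert((kIncludeDenseInput | kIncludeDenseOutput) == kIncludeDense, "");
  static_assert((kIncludeDense | kIncludeUndef) == kIncludeAll, "");
  assert((kIncludeDenseInput & kIncludeDenseOutput) == 0); // disjoint bits
  // The retry order progressively relaxes the constraints:
  // kIncludeAll, kIncludeDense, kIncludeDenseInput,
  // kIncludeDenseOutput, kIncludeUndef, kSparseOnly.
  return 0;
}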

mlir/test/Dialect/SparseTensor/sparse_2d.mlir

Lines changed: 29 additions & 30 deletions
@@ -1002,46 +1002,45 @@ func.func @scale(%arga: tensor<?x?xf64, #Tds>, %argx: tensor<?x?xf64>) -> tensor
   doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
 }
 
-// CHECK-LABEL: func @sampled_dense_dense(
+// CHECK-LABEL: func.func @sampled_dense_dense(
 // CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>,
 // CHECK-SAME: %[[VAL_1:.*1]]: tensor<?x?xf32>,
 // CHECK-SAME: %[[VAL_2:.*2]]: tensor<?x?xf32>,
 // CHECK-SAME: %[[VAL_3:.*3]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
 // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
 // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
 // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
 // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xf32>
-// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
-// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?x?xf32>
+// CHECK-DAG: %[[VAL_11:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf32>
+// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
 // CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
-// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_18]] to %[[VAL_19]] step %[[VAL_5]] {
-// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_20]]] : memref<?xindex>
-// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_20]]] : memref<?xindex>
-// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_20]], %[[VAL_5]] : index
-// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_23]]] : memref<?xindex>
-// CHECK: scf.for %[[VAL_25:.*]] = %[[VAL_22]] to %[[VAL_24]] step %[[VAL_5]] {
-// CHECK-DAG: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_25]]] : memref<?xindex>
-// CHECK-DAG: %[[VAL_27:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_25]]] : memref<?xf32>
-// CHECK-DAG: %[[VAL_28:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_21]], %[[VAL_26]]] : memref<?x?xf32>
-// CHECK: %[[VAL_29:.*]] = scf.for %[[VAL_30:.*]] = %[[VAL_4]] to %[[VAL_12]] step %[[VAL_5]] iter_args(%[[VAL_31:.*]] = %[[VAL_28]]) -> (f32) {
-// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_21]], %[[VAL_30]]] : memref<?x?xf32>
-// CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_30]], %[[VAL_26]]] : memref<?x?xf32>
-// CHECK: %[[VAL_34:.*]] = arith.mulf %[[VAL_32]], %[[VAL_33]] : f32
-// CHECK: %[[VAL_35:.*]] = arith.mulf %[[VAL_27]], %[[VAL_34]] : f32
-// CHECK: %[[VAL_36:.*]] = arith.addf %[[VAL_31]], %[[VAL_35]] : f32
-// CHECK: scf.yield %[[VAL_36]] : f32
-// CHECK: }
-// CHECK: memref.store %[[VAL_29]], %[[VAL_17]]{{\[}}%[[VAL_21]], %[[VAL_26]]] : memref<?x?xf32>
-// CHECK: }
-// CHECK: }
-// CHECK: %[[VAL_38:.*]] = bufferization.to_tensor %[[VAL_17]] : memref<?x?xf32>
-// CHECK: return %[[VAL_38]] : tensor<?x?xf32>
+// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
+// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_4]] {
+// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_5]] to %[[VAL_11]] step %[[VAL_4]] {
+// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_18]], %[[VAL_19]]] : memref<?x?xf32>
+// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref<?xindex>
+// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_17]], %[[VAL_4]] : index
+// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_22]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_24:.*]] = %[[VAL_21]] to %[[VAL_23]] step %[[VAL_4]] {
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_24]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_18]], %[[VAL_25]]] : memref<?x?xf32>
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_24]]] : memref<?xf32>
+// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_19]], %[[VAL_25]]] : memref<?x?xf32>
+// CHECK: %[[VAL_29:.*]] = arith.mulf %[[VAL_20]], %[[VAL_28]] : f32
+// CHECK: %[[VAL_30:.*]] = arith.mulf %[[VAL_27]], %[[VAL_29]] : f32
+// CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_26]], %[[VAL_30]] : f32
+// CHECK: memref.store %[[VAL_31]], %[[VAL_14]]{{\[}}%[[VAL_18]], %[[VAL_25]]] : memref<?x?xf32>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_32:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<?x?xf32>
+// CHECK: return %[[VAL_32]] : tensor<?x?xf32>
 // CHECK: }
 func.func @sampled_dense_dense(%args: tensor<?x?xf32, #Tss>,
                                %arga: tensor<?x?xf32>,
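
Reading the updated CHECK lines: the k-reduction, which used to be the innermost loop (the scf.for with iter_args), now sits between the loop over the compressed rows of the sampling matrix S and the loop over its stored columns, and partial sums go through the output buffer instead of an scf iter_arg. Roughly, the two schedules correspond to the following C++ (inferred from the test output, not code from this commit; the CSR struct and names are illustrative):

#include <cstdint>
#include <vector>

// Illustrative CSR view of the sparse sampling matrix S.
struct CSR {
  std::vector<int64_t> ptr; // row pointers
  std::vector<int64_t> idx; // column indices of stored entries
  std::vector<float> val;   // stored values
};

// Previous schedule: reduction over k innermost; B(k,j) is walked down a
// column (stride ldb) for every stored entry of S.
void sddmmOld(const CSR &S, const float *A, const float *B, float *X,
              int64_t K, int64_t lda, int64_t ldb, int64_t ldx) {
  int64_t rows = (int64_t)S.ptr.size() - 1;
  for (int64_t i = 0; i < rows; ++i)
    for (int64_t p = S.ptr[i]; p < S.ptr[i + 1]; ++p) {
      int64_t j = S.idx[p];
      float acc = X[i * ldx + j];
      for (int64_t k = 0; k < K; ++k)
        acc += S.val[p] * A[i * lda + k] * B[k * ldb + j];
      X[i * ldx + j] = acc;
    }
}

// New schedule: the dense k loop is in the middle; A(i,k) is hoisted out of
// the inner loop and row k of B is read forward across the stored columns j.
void sddmmNew(const CSR &S, const float *A, const float *B, float *X,
              int64_t K, int64_t lda, int64_t ldb, int64_t ldx) {
  int64_t rows = (int64_t)S.ptr.size() - 1;
  for (int64_t i = 0; i < rows; ++i)
    for (int64_t k = 0; k < K; ++k) {
      float a = A[i * lda + k];
      for (int64_t p = S.ptr[i]; p < S.ptr[i + 1]; ++p) {
        int64_t j = S.idx[p];
        X[i * ldx + j] += S.val[p] * a * B[k * ldb + j];
      }
    }
}

Both variants compute the same result; they differ only in which operands are streamed contiguously and in how often each output entry is touched, which is exactly the trade-off the new sort-mask split lets the iteration-graph sorter weigh.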
