Commit 51348ae

[mlir][Vector] Use a simpler lowering when emulating narrow type for vector.maskedload
arith.select should be used instead of a series of manual mask-manipulating ops (arith.andi/arith.ori/arith.extsi).
Parent: 86afda0
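
To make the change concrete, here is a minimal sketch of the new lowering, adapted from the updated comment in the patch (buffer and value names such as %0, %alloc, %c0, and %linear_index are illustrative): the narrow-type masked load is emulated with a masked load of whole i8 bytes, a bitcast back to i4, and a single arith.select on the original mask, instead of the earlier extsi/andi/xori/ori mask arithmetic.

    // Original op on i4 elements (3 active lanes out of 6):
    %mask = vector.constant_mask [3] : vector<6xi1>
    %1 = vector.maskedload %0[%c0, %c0], %mask, %pass_thru
        : memref<3x6xi4>, vector<6xi1>, vector<6xi4> into vector<6xi4>

    // Emulated form: load whole bytes, bitcast, and let arith.select
    // restore the original pass-through values in the lanes where the
    // mask is off.
    %new_mask = vector.constant_mask [2] : vector<3xi1>
    %new_pass_thru = vector.bitcast %pass_thru : vector<6xi4> to vector<3xi8>
    %load = vector.maskedload %alloc[%linear_index], %new_mask, %new_pass_thru
        : memref<9xi8>, vector<3xi1>, vector<3xi8> into vector<3xi8>
    %bitcast = vector.bitcast %load : vector<3xi8> to vector<6xi4>
    %result = arith.select %mask, %bitcast, %pass_thru : vector<6xi1>, vector<6xi4>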

2 files changed: +38 -148 lines


mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp

Lines changed: 15 additions & 55 deletions
@@ -135,35 +135,23 @@ struct ConvertVectorMaskedLoad final
 //
 // %mask = vector.constant_mask [3] : vector<6xi1>
 // %1 = vector.maskedload %0[%c0, %c0], %mask, %pass_thru :
-// memref<3x6xi4>, vector<6xi1>, vector<6xi4> into vector<6xi4>
+// memref<3x6xi4>, vector<6xi1>, vector<6xi4> into vector<6xi4>
 //
 // can be replaced with
 //
 // %new_mask = vector.constant_mask [2] : vector<3xi1>
-// %new_pass_thru = vector.bitcast %pass_thru : vector<6xi4> to
-// vector<3xi8> %1 = vector.maskedload %0[%linear_index], %new_mask,
-// %new_pass_thru : memref<9xi8>, vector<3xi1>, vector<3xi8> into
-// vector<3xi8>
+// %new_pass_thru = vector.bitcast %pass_thru :
+// vector<6xi4> to vector<3xi8>
+// %1 = vector.maskedload %0[%linear_index], %new_mask, %new_pass_thru :
+// memref<9xi8>, vector<3xi1>, vector<3xi8> into vector<3xi8>
+// %2 = vector.bitcast %1 : vector<3xi8> to vector<6xi4>
 //
 // Since we are effectively loading 16 bits (2xi8) from the memref with the
 // new mask, while originally we only wanted to effectively load 12 bits
 // (3xi4) from the memref, we need to set the second half of the last i8
-// that was effectively loaded (i.e. the second i8) to 0.
+// that was effectively loaded (i.e. the second i8) to %pass_thru.
 //
-// %unset_mask = arith.extsi %mask : vector<6xi1> to vector<6xi4>
-// %2 = vector.bitcast %unset_mask : vector<6xi4> to vector<3xi8>
-// %3 = arith.andi %1, %2 : vector<3xi8>
-//
-// Then if the second half of the second i8 from %pass_thru is not all 0s,
-// we need to write their values back to the result.
-//
-// %cst_1 = arith.constant dense<-1> : vector<6xi4>
-// %set_mask = arith.xori %unset_mask, %cst_1 : vector<6xi4>
-// %4 = vector.bitcast %set_mask : vector<6xi4> to vector<3xi8>
-// %5 = arith.andi %new_pass_thru, %4 : vector<3xi8>
-//
-// %6 = arith.ori %3, %5 : vector<3xi8>
-// %7 = vector.bitcast %6 : vector<3xi8> to vector<6xi4>
+// %3 = arith.select %mask, %2, %pass_thru : vector<6xi1>, vector<6xi4>
 //
 // Given these input values:
 // %mask = [1, 1, 1, 0, 0, 0]
@@ -177,17 +165,8 @@ struct ConvertVectorMaskedLoad final
 // %new_mask = [1, 1, 0]
 // %new_pass_thru = [0x78, 0x9A, 0xBC]
 // %1 = [0x12, 0x34, 0xBC]
-//
-// %unset_mask = [0xF, 0xF, 0xF, 0, 0, 0]
-// %2 = [0xFF, 0xF0, 0]
-// %3 = [0x12, 0x30, 0]
-//
-// %set_mask = [0, 0, 0, 0xF, 0xF, 0xF]
-// %4 = [0, 0x0F, 0xFF]
-// %5 = [0, 0x0A, 0xBC]
-//
-// %6 = [0x12, 0x3A, 0xBC]
-// %7 = [0x1, 0x2, 0x3, 0xA, 0xB, 0xC]
+// %2 = [0x1, 0x2, 0x3, 0x4, 0xB, 0xC]
+// %3 = [0x1, 0x2, 0x3, 0xA, 0xB, 0xC]
 //
 // TODO: Currently, only the even number of elements loading is supported.
 // To deal with the odd number of elements, one has to extract the
@@ -280,32 +259,13 @@ struct ConvertVectorMaskedLoad final
     newMask->getResult(0), newPassThru);
 
 // Setting the part that originally was not effectively loaded from memory
-// to 0.
-auto andMask = rewriter.create<arith::ExtSIOp>(loc, origType, op.getMask());
-auto bitCastedAndMask =
-    rewriter.create<vector::BitCastOp>(loc, newType, andMask);
-auto loadedFromMem =
-    rewriter.create<arith::AndIOp>(loc, newLoad, bitCastedAndMask);
-
-// Copying from pass through.
-auto allOne = rewriter.create<arith::ConstantOp>(
-    loc, origType,
-    DenseIntElementsAttr::get(origType, {APInt::getAllOnes(srcBits)}));
-auto passThruMask = rewriter.create<arith::XOrIOp>(loc, allOne.getResult(),
-    andMask.getResult());
-auto bitCastedPassThruMask =
-    rewriter.create<vector::BitCastOp>(loc, newType, passThruMask);
-auto copiedFromPassThru =
-    rewriter.create<arith::AndIOp>(loc, newPassThru, bitCastedPassThruMask);
-
-// Or-ing the first part loaded from memory and the second one copied from
-// pass through to form the result.
-auto result =
-    rewriter.create<arith::OrIOp>(loc, loadedFromMem, copiedFromPassThru);
+// to pass through.
 auto bitCast =
-    rewriter.create<vector::BitCastOp>(loc, op.getType(), result);
+    rewriter.create<vector::BitCastOp>(loc, op.getType(), newLoad);
+auto select = rewriter.create<arith::SelectOp>(loc, op.getMask(), bitCast,
+    op.getPassThru());
+rewriter.replaceOp(op, select->getResult(0));
 
-rewriter.replaceOp(op, bitCast->getResult(0));
 return success();
 }
 };
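
The worked values in the comment above reduce to the element-wise semantics of arith.select: wherever the original i4-granularity mask is set, the lane comes from the bitcast load; elsewhere it comes from the original pass-through vector. A small self-contained illustration (the constants are made up for this sketch and kept within the signed i4 range; they are not the values from the patch):

    func.func @select_semantics_example() -> vector<6xi4> {
      %mask = arith.constant dense<[true, true, true, false, false, false]> : vector<6xi1>
      // Stand-ins for the bitcast load result and the original pass-through.
      %loaded = arith.constant dense<[1, 2, 3, 4, 5, 6]> : vector<6xi4>
      %pass_thru = arith.constant dense<[7, 0, -1, -2, -3, -4]> : vector<6xi4>
      // Result is [1, 2, 3, -2, -3, -4]: loaded lanes where the mask is set,
      // pass-through lanes elsewhere.
      %r = arith.select %mask, %loaded, %pass_thru : vector<6xi1>, vector<6xi4>
      return %r : vector<6xi4>
    }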

mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir

Lines changed: 23 additions & 93 deletions
@@ -141,16 +141,9 @@ func.func @vector_maskedload_i8(%arg1: index, %arg2: index, %arg3: index, %passt
 // CHECK32: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<4xi8> to vector<1xi32>
 // CHECK32: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
 // CHECK32-SAME: memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
-// CHECK32: %[[EXT:.+]] = arith.extsi %[[ORIG_MASK]] : vector<4xi1> to vector<4xi8>
-// CHECK32: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<4xi8> to vector<1xi32>
-// CHECK32: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<1xi32>
-// CHECK32: %[[ONES:.+]] = arith.constant dense<-1> : vector<4xi8>
-// CHECK32: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<4xi8>
-// CHECK32: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<4xi8> to vector<1xi32>
-// CHECK32: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<1xi32>
-// CHECK32: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<1xi32>
-// CHECK32: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<4xi8>
-// CHECK32: return %[[VEC_I4]]
+// CHECK32: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<4xi8>
+// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<4xi1>, vector<4xi8>
+// CHECK32: return %[[SELECT]]
 
 // -----
 
@@ -176,15 +169,8 @@ func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passt
 // CHECK: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<8xi4> to vector<4xi8>
 // CHECK: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
 // CHECK-SAME: memref<12xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
-// CHECK: %[[EXT:.+]] = arith.extsi %[[ORIG_MASK]] : vector<8xi1> to vector<8xi4>
-// CHECK: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<8xi4> to vector<4xi8>
-// CHECK: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<4xi8>
-// CHECK: %[[ONES:.+]] = arith.constant dense<-1> : vector<8xi4>
-// CHECK: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<8xi4>
-// CHECK: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<8xi4> to vector<4xi8>
-// CHECK: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<4xi8>
-// CHECK: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<4xi8>
-// CHECK: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xi4>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<4xi8> to vector<8xi4>
+// CHECK: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<8xi1>, vector<8xi4>
 
 // CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
 // CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 7) floordiv 8)>
@@ -199,15 +185,8 @@ func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passt
 // CHECK32: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<8xi4> to vector<1xi32>
 // CHECK32: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
 // CHECK32-SAME: memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
-// CHECK32: %[[EXT:.+]] = arith.extsi %[[ORIG_MASK]] : vector<8xi1> to vector<8xi4>
-// CHECK32: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<8xi4> to vector<1xi32>
-// CHECK32: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<1xi32>
-// CHECK32: %[[ONES:.+]] = arith.constant dense<-1> : vector<8xi4>
-// CHECK32: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<8xi4>
-// CHECK32: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<8xi4> to vector<1xi32>
-// CHECK32: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<1xi32>
-// CHECK32: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<1xi32>
-// CHECK32: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xi4>
+// CHECK32: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<8xi4>
+// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<8xi1>, vector<8xi4>
 
 // -----
 
@@ -239,16 +218,9 @@ func.func @vector_cst_maskedload_i8(%arg1: index, %arg2: index, %passthru: vecto
 // CHECK32: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<4xi8> to vector<1xi32>
 // CHECK32: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
 // CHECK32-SAME: memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
-// CHECK32: %[[EXT:.+]] = arith.extsi %[[ORIG_MASK]] : vector<4xi1> to vector<4xi8>
-// CHECK32: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<4xi8> to vector<1xi32>
-// CHECK32: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<1xi32>
-// CHECK32: %[[ONES:.+]] = arith.constant dense<-1> : vector<4xi8>
-// CHECK32: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<4xi8>
-// CHECK32: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<4xi8> to vector<1xi32>
-// CHECK32: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<1xi32>
-// CHECK32: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<1xi32>
-// CHECK32: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<4xi8>
-// CHECK32: return %[[VEC_I4]]
+// CHECK32: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<4xi8>
+// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<4xi1>, vector<4xi8>
+// CHECK32: return %[[SELECT]]
 
 // -----
 
@@ -272,36 +244,22 @@ func.func @vector_cst_maskedload_i4(%arg1: index, %arg2: index, %passthru: vecto
 // CHECK: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG2]] : vector<8xi4> to vector<4xi8>
 // CHECK: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
 // CHECK-SAME: memref<12xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
-// CHECK: %[[EXT:.+]] = arith.extsi %[[ORIG_MASK]] : vector<8xi1> to vector<8xi4>
-// CHECK: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<8xi4> to vector<4xi8>
-// CHECK: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<4xi8>
-// CHECK: %[[ONES:.+]] = arith.constant dense<-1> : vector<8xi4>
-// CHECK: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<8xi4>
-// CHECK: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<8xi4> to vector<4xi8>
-// CHECK: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<4xi8>
-// CHECK: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<4xi8>
-// CHECK: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xi4>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<4xi8> to vector<8xi4>
+// CHECK: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG2]] : vector<8xi1>, vector<8xi4>
 
 // CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
 // CHECK32: func @vector_cst_maskedload_i4(
 // CHECK32-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
 // CHECK32-SAME: %[[ARG2:[a-zA-Z0-9]+]]: vector<8xi4>)
 // CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
-// CHECK32: %[[ORIG_MASK:.+]] = vector.constant_mask [4] : vector<8xi1>
+// CHECK32: %[[ORIG_MASK:.+]] = vector.constant_mask [4] : vector<8xi1>
 // CHECK32: %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
 // CHECK32: %[[NEW_MASK:.+]] = vector.constant_mask [1] : vector<1xi1>
 // CHECK32: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG2]] : vector<8xi4> to vector<1xi32>
 // CHECK32: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
 // CHECK32-SAME: memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
-// CHECK32: %[[EXT:.+]] = arith.extsi %[[ORIG_MASK]] : vector<8xi1> to vector<8xi4>
-// CHECK32: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<8xi4> to vector<1xi32>
-// CHECK32: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<1xi32>
-// CHECK32: %[[ONES:.+]] = arith.constant dense<-1> : vector<8xi4>
-// CHECK32: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<8xi4>
-// CHECK32: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<8xi4> to vector<1xi32>
-// CHECK32: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<1xi32>
-// CHECK32: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<1xi32>
-// CHECK32: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xi4>
+// CHECK32: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<8xi4>
+// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG2]] : vector<8xi1>, vector<8xi4>
 
 // -----
 
@@ -331,15 +289,8 @@ func.func @vector_extract_maskedload_i4(%arg1: index) -> vector<8x8x16xi4> {
 // CHECK: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<8xi8>
 // CHECK: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
 // CHECK-SAME: memref<512xi8>, vector<8xi1>, vector<8xi8> into vector<8xi8>
-// CHECK: %[[EXT:.+]] = arith.extsi %[[ORIG_EXT2]] : vector<16xi1> to vector<16xi4>
-// CHECK: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<16xi4> to vector<8xi8>
-// CHECK: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<8xi8>
-// CHECK: %[[ONES:.+]] = arith.constant dense<-1> : vector<16xi4>
-// CHECK: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<16xi4>
-// CHECK: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<16xi4> to vector<8xi8>
-// CHECK: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<8xi8>
-// CHECK: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<8xi8>
-// CHECK: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<8xi8> to vector<16xi4>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<8xi8> to vector<16xi4>
+// CHECK: %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>
 
 // CHECK32: func @vector_extract_maskedload_i4(
 // CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<128xi32>
@@ -353,15 +304,8 @@ func.func @vector_extract_maskedload_i4(%arg1: index) -> vector<8x8x16xi4> {
 // CHECK32: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<2xi32>
 // CHECK32: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
 // CHECK32-SAME: memref<128xi32>, vector<2xi1>, vector<2xi32> into vector<2xi32>
-// CHECK32: %[[EXT:.+]] = arith.extsi %[[ORIG_EXT2]] : vector<16xi1> to vector<16xi4>
-// CHECK32: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<16xi4> to vector<2xi32>
-// CHECK32: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<2xi32>
-// CHECK32: %[[ONES:.+]] = arith.constant dense<-1> : vector<16xi4>
-// CHECK32: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<16xi4>
-// CHECK32: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<16xi4> to vector<2xi32>
-// CHECK32: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<2xi32>
-// CHECK32: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<2xi32>
-// CHECK32: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<2xi32> to vector<16xi4>
+// CHECK32: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi32> to vector<16xi4>
+// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>
 
 // -----
 
@@ -389,15 +333,8 @@ func.func @vector_extract_cst_maskedload_i4() -> vector<8x8x16xi4> {
 // CHECK: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<8xi8>
 // CHECK: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
 // CHECK-SAME: memref<512xi8>, vector<8xi1>, vector<8xi8> into vector<8xi8>
-// CHECK: %[[EXT:.+]] = arith.extsi %[[ORIG_EXT2]] : vector<16xi1> to vector<16xi4>
-// CHECK: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<16xi4> to vector<8xi8>
-// CHECK: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<8xi8>
-// CHECK: %[[ONES:.+]] = arith.constant dense<-1> : vector<16xi4>
-// CHECK: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<16xi4>
-// CHECK: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<16xi4> to vector<8xi8>
-// CHECK: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<8xi8>
-// CHECK: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<8xi8>
-// CHECK: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<8xi8> to vector<16xi4>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<8xi8> to vector<16xi4>
+// CHECK: %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>
 
 // CHECK32: func @vector_extract_cst_maskedload_i4(
 // CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<128xi32>
@@ -411,12 +348,5 @@ func.func @vector_extract_cst_maskedload_i4() -> vector<8x8x16xi4> {
 // CHECK32: %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<2xi32>
 // CHECK32: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
 // CHECK32-SAME: memref<128xi32>, vector<2xi1>, vector<2xi32> into vector<2xi32>
-// CHECK32: %[[EXT:.+]] = arith.extsi %[[ORIG_EXT2]] : vector<16xi1> to vector<16xi4>
-// CHECK32: %[[AND_MASK:.+]] = vector.bitcast %[[EXT]] : vector<16xi4> to vector<2xi32>
-// CHECK32: %[[FIRST_PART:.+]] = arith.andi %[[LOAD]], %[[AND_MASK]] : vector<2xi32>
-// CHECK32: %[[ONES:.+]] = arith.constant dense<-1> : vector<16xi4>
-// CHECK32: %[[XOR:.+]] = arith.xori %[[ONES]], %[[EXT]] : vector<16xi4>
-// CHECK32: %[[PASSTHRU_MASK:.+]] = vector.bitcast %[[XOR]] : vector<16xi4> to vector<2xi32>
-// CHECK32: %[[SECOND_PART:.+]] = arith.andi %[[NEW_PASSTHRU]], %[[PASSTHRU_MASK]] : vector<2xi32>
-// CHECK32: %[[VEC:.+]] = arith.ori %[[FIRST_PART]], %[[SECOND_PART]] : vector<2xi32>
-// CHECK32: %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<2xi32> to vector<16xi4>
+// CHECK32: %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi32> to vector<16xi4>
+// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>
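
The hunks above show only the updated CHECK lines. For orientation, the i4 masked-load cases being checked have roughly the following shape (a hypothetical reconstruction from the truncated @@ headers; the committed test's exact operand names, mask construction, and memref shape may differ):

    func.func @maskedload_i4_sketch(%row: index, %col: index, %num_elems: index,
                                    %pass_thru: vector<8xi4>) -> vector<8xi4> {
      %src = memref.alloc() : memref<3x8xi4>
      %mask = vector.create_mask %num_elems : vector<8xi1>
      // The emulation pass rewrites this into an i8 (or i32) maskedload,
      // a vector.bitcast, and an arith.select on %mask.
      %0 = vector.maskedload %src[%row, %col], %mask, %pass_thru
          : memref<3x8xi4>, vector<8xi1>, vector<8xi4> into vector<8xi4>
      return %0 : vector<8xi4>
    }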
