Remove SegSize and provide general lowering

rj-jesus · rj-jesus · commit 9df5ff0d939d · 2024-10-07T06:12:09.000-07:00
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -19969,47 +19969,33 @@ dependent.
 
 ::
 
-    declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<n> x <ty>> %op2, <<n> x i1> %mask, i32 <segsize>)
-    declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <vscale x <n> x <ty>> %op2, <vscale x <n> x i1> %mask, i32 <segsize>)
+    declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<m> x <ty>> %op2, <<n> x i1> %mask)
+    declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <<m> x <ty>> %op2, <vscale x <n> x i1> %mask)
 
 Overview:
 """""""""
 
-Find elements of the first argument matching any elements of the second.
+Find active elements of the first argument matching any elements of the second.
 
 Arguments:
 """"""""""
 
-The first argument is the search vector, the second argument is the vector of
+The first argument is the search vector, the second argument the vector of
 elements we are searching for (i.e. for which we consider a match successful),
 and the third argument is a mask that controls which elements of the first
-argument are active. The fourth argument is an immediate that sets the segment
-size for the search window.
+argument are active.
 
 Semantics:
 """"""""""
 
-The '``llvm.experimental.vector.match``' intrinsic compares each element in the
-first argument against potentially several elements of the second, placing
+The '``llvm.experimental.vector.match``' intrinsic compares each active element
+in the first argument against the elements of the second argument, placing
 ``1`` in the corresponding element of the output vector if any comparison is
 successful, and ``0`` otherwise. Inactive elements in the mask are set to ``0``
-in the output. The segment size controls the number of elements of the second
-argument that are compared against.
-
-For example, for vectors with 16 elements, if ``segsize = 16`` then each
-element of the first argument is compared against all 16 elements of the second
-argument; but if ``segsize = 4``, then each of the first four elements of the
-first argument is compared against the first four elements of the second
-argument, each of the second four elements of the first argument is compared
-against the second four elements of the second argument, and so forth.
-
-Currently, ``segsize`` needs to be an immediate value. The special value of
-``-1`` is allowed to indicate all elements should be searched.
-
-Support for specific vector types is target dependent. For AArch64 targets with
-SVE2 support, the intrinsic is valid on ``<16 x i8>`` or ``<8 x i16>`` vectors,
-or the scalable equivalents, with a ``segsize`` equal to the known minimum
-number of elements of the vectors (16 or 8, respectively).
+in the output.
+
+The second argument needs to be a fixed-length vector with the same element
+type as the first argument.
 
 Matrix Intrinsics
 -----------------
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1744,9 +1744,10 @@ class TargetTransformInfo {
   bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                              Align Alignment) const;
 
-  /// \returns Returns true if the target supports vector match operations for
-  /// the vector type `VT` using a segment size of `SegSize`.
-  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+  /// \returns True if the target has hardware support for vector match
+  /// operations between vectors of type `VT` and search vectors of `SearchSize`
+  /// elements, and false otherwise.
+  bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const;
 
   struct VPLegalization {
     enum VPTransform {
@@ -2186,7 +2187,7 @@ class TargetTransformInfo::Concept {
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const = 0;
-  virtual bool hasVectorMatch(VectorType *VT, unsigned SegSize) const = 0;
+  virtual bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const = 0;
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2957,8 +2958,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
   }
 
-  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const override {
-    return Impl.hasVectorMatch(VT, SegSize);
+  bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const override {
+    return Impl.hasVectorMatch(VT, SearchSize);
   }
 
   VPLegalization
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -972,7 +972,9 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
-  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }
+  bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const {
+    return false;
+  }
 
   TargetTransformInfo::VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
@@ -1916,11 +1916,9 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
 def int_experimental_vector_match : DefaultAttrsIntrinsic<
                              [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
                              [ llvm_anyvector_ty,
-                               LLVMMatchType<0>,
-                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,  // Mask
-                               llvm_i32_ty ],  // Segment size
-                             [ IntrNoMem, IntrNoSync, IntrWillReturn,
-                               ImmArg<ArgIndex<3>> ]>;
+                               llvm_anyvector_ty,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],  // Mask
+                             [ IntrNoMem, IntrNoSync, IntrWillReturn ]>;
 
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1355,8 +1355,8 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
 }
 
 bool TargetTransformInfo::hasVectorMatch(VectorType *VT,
-                                         unsigned SegSize) const {
-  return TTIImpl->hasVectorMatch(VT, SegSize);
+                                         unsigned SearchSize) const {
+  return TTIImpl->hasVectorMatch(VT, SearchSize);
 }
 
 TargetTransformInfo::Concept::~Concept() = default;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8138,12 +8138,39 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   }
   case Intrinsic::experimental_vector_match: {
-    auto *VT = dyn_cast<VectorType>(I.getOperand(0)->getType());
-    auto SegmentSize = cast<ConstantInt>(I.getOperand(3))->getLimitedValue();
+    SDValue Op1 = getValue(I.getOperand(0));
+    SDValue Op2 = getValue(I.getOperand(1));
+    SDValue Mask = getValue(I.getOperand(2));
+    EVT Op1VT = Op1.getValueType();
+    EVT Op2VT = Op2.getValueType();
+    EVT ResVT = Mask.getValueType();
+    unsigned SearchSize = Op2VT.getVectorNumElements();
+
+    LLVMContext &Ctx = *DAG.getContext();
     const auto &TTI =
         TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
-    assert(VT && TTI.hasVectorMatch(VT, SegmentSize) && "Unsupported type!");
-    visitTargetIntrinsic(I, Intrinsic);
+
+    // If the target has native support for this vector match operation, lower
+    // the intrinsic directly; otherwise, lower it below.
+    if (TTI.hasVectorMatch(cast<VectorType>(Op1VT.getTypeForEVT(Ctx)),
+                           SearchSize)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    SDValue Ret = DAG.getNode(ISD::SPLAT_VECTOR, sdl, ResVT,
+                              DAG.getConstant(0, sdl, MVT::i1));
+
+    for (unsigned i = 0; i < SearchSize; ++i) {
+      SDValue Op2Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl,
+                                    Op2VT.getVectorElementType(), Op2,
+                                    DAG.getVectorIdxConstant(i, sdl));
+      SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, sdl, Op1VT, Op2Elem);
+      SDValue Cmp = DAG.getSetCC(sdl, ResVT, Op1, Splat, ISD::SETEQ);
+      Ret = DAG.getNode(ISD::OR, sdl, ResVT, Ret, Cmp);
+    }
+
+    setValue(&I, DAG.getNode(ISD::AND, sdl, ResVT, Ret, Mask));
     return;
   }
   case Intrinsic::vector_reverse:
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
@@ -6112,28 +6112,21 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     Value *Op1 = Call.getArgOperand(0);
     Value *Op2 = Call.getArgOperand(1);
     Value *Mask = Call.getArgOperand(2);
-    Value *SegSize = Call.getArgOperand(3);
 
-    VectorType *OpTy = dyn_cast<VectorType>(Op1->getType());
+    VectorType *Op1Ty = dyn_cast<VectorType>(Op1->getType());
+    VectorType *Op2Ty = dyn_cast<VectorType>(Op2->getType());
     VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
-    Check(OpTy && MaskTy, "experimental.vector.match operands are not vectors.",
-          &Call);
-    Check(Op2->getType() == OpTy,
-          "experimental.vector.match first two operands must have matching "
-          "types.",
-          &Call);
-    Check(isa<ConstantInt>(SegSize),
-          "experimental.vector.match segment size needs to be an immediate "
-          "integer.",
-          &Call);
 
-    ElementCount EC = OpTy->getElementCount();
-    Check(MaskTy->getElementCount() == EC,
-          "experimental.vector.match mask must have the same number of "
-          "elements as the remaining vector operands.",
+    Check(Op1Ty && Op2Ty && MaskTy, "Operands must be vectors.", &Call);
+    Check(!isa<ScalableVectorType>(Op2Ty), "Second operand cannot be scalable.",
+          &Call);
+    Check(Op1Ty->getElementType() == Op2Ty->getElementType(),
+          "First two operands must have the same element type.", &Call);
+    Check(Op1Ty->getElementCount() == MaskTy->getElementCount(),
+          "First operand and mask must have the same number of elements.",
           &Call);
     Check(MaskTy->getElementType()->isIntegerTy(1),
-          "experimental.vector.match mask element type is not i1.", &Call);
+          "Mask must be a vector of i1's.", &Call);
     break;
   }
   case Intrinsic::vector_insert: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6262,41 +6262,48 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     auto Op1 = Op.getOperand(1);
     auto Op2 = Op.getOperand(2);
     auto Mask = Op.getOperand(3);
-    auto SegmentSize =
-        cast<ConstantSDNode>(Op.getOperand(4))->getLimitedValue();
 
-    EVT VT = Op.getValueType();
-    auto MinNumElts = VT.getVectorMinNumElements();
-
-    assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
-    assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
-           "Custom lower only works on 128-bit segments.");
-    assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
-            Op1.getValueType().getVectorElementType() == MVT::i16) &&
-           "Custom lower only supports 8-bit or 16-bit characters.");
-    assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
-                                        "match minimum number of elements.");
-
-    if (VT.isScalableVector())
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Mask, Op1, Op2);
-
-    // We can use the SVE2 match instruction to lower this intrinsic by
-    // converting the operands to scalable vectors, doing a match, and then
-    // extracting a fixed-width subvector from the scalable vector.
+    EVT Op1VT = Op1.getValueType();
+    EVT Op2VT = Op2.getValueType();
+    EVT ResVT = Op.getValueType();
 
-    EVT OpVT = Op1.getValueType();
-    EVT OpContainerVT = getContainerForFixedLengthVector(DAG, OpVT);
+    assert((Op1VT.getVectorElementType() == MVT::i8 ||
+            Op1VT.getVectorElementType() == MVT::i16) &&
+           "Expected 8-bit or 16-bit characters.");
+    assert(!Op2VT.isScalableVector() && "Search vector cannot be scalable.");
+    assert(Op1VT.getVectorElementType() == Op2VT.getVectorElementType() &&
+           "Operand type mismatch.");
+    assert(Op1VT.getVectorMinNumElements() == Op2VT.getVectorNumElements() &&
+           "Invalid operands.");
+
+    // Wrap the search vector in a scalable vector.
+    EVT OpContainerVT = getContainerForFixedLengthVector(DAG, Op2VT);
+    Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
+
+    // If the result is scalable, we need to broadcast the search vector across
+    // the SVE register and then carry out the MATCH.
+    if (ResVT.isScalableVector()) {
+      Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
+                        DAG.getTargetConstant(0, dl, MVT::i64));
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1,
+                         Op2);
+    }
+
+    // If the result is fixed, we can still use MATCH but we need to wrap the
+    // first operand and the mask in scalable vectors before doing so.
     EVT MatchVT = OpContainerVT.changeElementType(MVT::i1);
 
-    auto ScalableOp1 = convertToScalableVector(DAG, OpContainerVT, Op1);
-    auto ScalableOp2 = convertToScalableVector(DAG, OpContainerVT, Op2);
-    auto ScalableMask = DAG.getNode(ISD::SIGN_EXTEND, dl, OpVT, Mask);
-    ScalableMask = convertFixedMaskToScalableVector(ScalableMask, DAG);
+    // Wrap the operands.
+    Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
+    Mask = DAG.getNode(ISD::ANY_EXTEND, dl, Op1VT, Mask);
+    Mask = convertFixedMaskToScalableVector(Mask, DAG);
 
-    SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID,
-                                ScalableMask, ScalableOp1, ScalableOp2);
+    // Carry out the match.
+    SDValue Match =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID, Mask, Op1, Op2);
 
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+    // Extract and return the result.
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op1VT,
                        DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match),
                        DAG.getVectorIdxConstant(0, dl));
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4041,14 +4041,26 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
   }
 }
 
-bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
-  // Check that the target has SVE2 (and SVE is available), that `VT' is a
-  // legal type for MATCH, and that the segment size is 128-bit.
+bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SearchSize) const {
+  // Check that (i) the target has SVE2 and SVE is available, (ii) `VT' is a
+  // legal type for MATCH, and (iii) the search vector can be broadcast
+  // efficiently to a legal type.
+  //
+  // Currently, we require the length of the search vector to match the minimum
+  // number of elements of `VT'. In practice this means we only support the
+  // cases (nxv16i8, 16), (v16i8, 16), (nxv8i16, 8), and (v8i16, 8), where the
+  // first element of the tuples corresponds to the type of the first argument
+  // and the second the length of the search vector.
+  //
+  // In the future we can support more cases. For example, (nxv16i8, 4) could
+  // be efficiently supported by using a DUP.S to broadcast the search
+  // elements, and more exotic cases like (nxv16i8, 5) could be supported by a
+  // sequence of SEL(DUP).
   if (ST->hasSVE2() && ST->isSVEAvailable() &&
       VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
-      VT->getElementCount().getKnownMinValue() == SegSize &&
       (VT->getElementCount().getKnownMinValue() == 8 ||
-       VT->getElementCount().getKnownMinValue() == 16))
+       VT->getElementCount().getKnownMinValue() == 16) &&
+      VT->getElementCount().getKnownMinValue() == SearchSize)
     return true;
   return false;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -392,7 +392,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->hasSVE();
   }
 
-  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+  bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const;
 
   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                              std::optional<FastMathFlags> FMF,
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll

Original file line number	Diff line number	Diff line change
`@@ -972,7 +972,9 @@ class TargetTransformInfoImplBase {`
`972`	`972`	`return false;`
`973`	`973`	`}`
`974`	`974`
`975`		`- bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }`
	`975`	`+ bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const {`
	`976`	`+ return false;`
	`977`	`+ }`
`976`	`978`
`977`	`979`	`TargetTransformInfo::VPLegalization`
`978`	`980`	`getVPLegalizationStrategy(const VPIntrinsic &PI) const {`
Original file line number	Diff line number	Diff line change
`@@ -1355,8 +1355,8 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,`
`1355`	`1355`	`}`
`1356`	`1356`
`1357`	`1357`	`bool TargetTransformInfo::hasVectorMatch(VectorType *VT,`
`1358`		`- unsigned SegSize) const {`
`1359`		`- return TTIImpl->hasVectorMatch(VT, SegSize);`
	`1358`	`+ unsigned SearchSize) const {`
	`1359`	`+ return TTIImpl->hasVectorMatch(VT, SearchSize);`
`1360`	`1360`	`}`
`1361`	`1361`
`1362`	`1362`	`TargetTransformInfo::Concept::~Concept() = default;`
Original file line number	Diff line number	Diff line change
`@@ -392,7 +392,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {`
`392`	`392`	`return ST->hasSVE();`
`393`	`393`	`}`
`394`	`394`
`395`		`- bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;`
	`395`	`+ bool hasVectorMatch(VectorType *VT, unsigned SearchSize) const;`
`396`	`396`
`397`	`397`	`InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,`
`398`	`398`	`std::optional<FastMathFlags> FMF,`