[AArch64] Add @llvm.experimental.vector.match

rj-jesus · rj-jesus · commit 3e95323988f3 · 2024-10-07T06:11:37.000-07:00
This patch introduces an experimental intrinsic for matching the
elements of one vector against the elements of another.

For AArch64 targets that support SVE2, it lowers to a MATCH instruction
for supported fixed and scalable types.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -19958,6 +19958,59 @@ are undefined.
     }
 
 
+'``llvm.experimental.vector.match.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. Support for specific vector types is target
+dependent.
+
+::
+
+    declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<n> x <ty>> %op2, <<n> x i1> %mask, i32 <segsize>)
+    declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <vscale x <n> x <ty>> %op2, <vscale x <n> x i1> %mask, i32 <segsize>)
+
+Overview:
+"""""""""
+
+Find elements of the first argument matching any elements of the second.
+
+Arguments:
+""""""""""
+
+The first argument is the search vector, the second argument is the vector of
+elements we are searching for (i.e. for which we consider a match successful),
+and the third argument is a mask that controls which elements of the first
+argument are active. The fourth argument is an immediate that sets the segment
+size for the search window.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.match``' intrinsic compares each element in the
+first argument against potentially several elements of the second, placing
+``1`` in the corresponding element of the output vector if any comparison is
+successful, and ``0`` otherwise. Inactive elements in the mask are set to ``0``
+in the output. The segment size controls the number of elements of the second
+argument that are compared against.
+
+For example, for vectors with 16 elements, if ``segsize = 16`` then each
+element of the first argument is compared against all 16 elements of the second
+argument; but if ``segsize = 4``, then each of the first four elements of the
+first argument is compared against the first four elements of the second
+argument, each of the second four elements of the first argument is compared
+against the second four elements of the second argument, and so forth.
+
+Currently, ``segsize`` needs to be an immediate value. The special value of
+``-1`` is allowed to indicate all elements should be searched.
+
+Support for specific vector types is target dependent. For AArch64 targets with
+SVE2 support, the intrinsic is valid on ``<16 x i8>`` or ``<8 x i16>`` vectors,
+or the scalable equivalents, with a ``segsize`` equal to the known minimum
+number of elements of the vectors (16 or 8, respectively).
+
 Matrix Intrinsics
 -----------------
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1744,6 +1744,10 @@ class TargetTransformInfo {
   bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                              Align Alignment) const;
 
+  /// \returns True if the target supports vector match operations for the
+  /// vector type `VT` using a segment size of `SegSize`.
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
   struct VPLegalization {
     enum VPTransform {
       // keep the predicating parameter
@@ -2182,6 +2186,7 @@ class TargetTransformInfo::Concept {
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const = 0;
+  // Whether vector match is supported for `VT` with segment size `SegSize`.
+  virtual bool hasVectorMatch(VectorType *VT, unsigned SegSize) const = 0;
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2952,6 +2957,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
   }
 
+  // Dispatches the vector match support query to the wrapped `Impl`.
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const override {
+    return Impl.hasVectorMatch(VT, SegSize);
+  }
+
   VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
     return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -972,6 +972,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  // Default implementation: vector match is reported as unsupported for every
+  // vector type and segment size.
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }
+
   TargetTransformInfo::VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const {
     return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
@@ -1912,6 +1912,16 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
                              [ IntrArgMemOnly ]>;
 
+// Experimental match
+// For each active element of the first operand, reports whether it matches
+// any element in the corresponding segment of the second operand. The result
+// is an i1 vector with the same element count as the operands.
+def int_experimental_vector_match : DefaultAttrsIntrinsic<
+                             [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
+                             [ llvm_anyvector_ty,  // Search vector
+                               LLVMMatchType<0>,  // Elements to search for
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,  // Mask
+                               llvm_i32_ty ],  // Segment size
+                             [ IntrNoMem, IntrNoSync, IntrWillReturn,
+                               ImmArg<ArgIndex<3>> ]>;  // Segment size is an immediate
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1354,6 +1354,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+// Forwards the vector match support query to the target's TTI implementation.
+bool TargetTransformInfo::hasVectorMatch(VectorType *VT,
+                                         unsigned SegSize) const {
+  return TTIImpl->hasVectorMatch(VT, SegSize);
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8137,6 +8137,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
              DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
     return;
   }
+  case Intrinsic::experimental_vector_match: {
+    // The call is handed to the target unchanged via visitTargetIntrinsic.
+    // The TTI support query only feeds the assertion, so it is compiled out
+    // together with it: this avoids unused-variable warnings and a needless
+    // TTI construction in release (NDEBUG) builds.
+#ifndef NDEBUG
+    auto *VT = dyn_cast<VectorType>(I.getOperand(0)->getType());
+    auto SegmentSize = cast<ConstantInt>(I.getOperand(3))->getLimitedValue();
+    const auto &TTI =
+        TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
+    assert(VT && TTI.hasVectorMatch(VT, SegmentSize) && "Unsupported type!");
+#endif
+    visitTargetIntrinsic(I, Intrinsic);
+    return;
+  }
   case Intrinsic::vector_reverse:
     visitVectorReverse(I);
     return;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
@@ -6108,6 +6108,34 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           &Call);
     break;
   }
+  case Intrinsic::experimental_vector_match: {
+    // Operand layout: (search vector, needle vector, mask, segment size).
+    Type *SearchTy = Call.getArgOperand(0)->getType();
+    Type *NeedleTy = Call.getArgOperand(1)->getType();
+    auto *SearchVecTy = dyn_cast<VectorType>(SearchTy);
+    auto *MaskVecTy = dyn_cast<VectorType>(Call.getArgOperand(2)->getType());
+
+    // The search operand and the mask must both be vectors.
+    Check(SearchVecTy && MaskVecTy,
+          "experimental.vector.match operands are not vectors.", &Call);
+    // The search and needle vectors share a single type.
+    Check(NeedleTy == SearchTy,
+          "experimental.vector.match first two operands must have matching "
+          "types.",
+          &Call);
+    // The segment size is only accepted as a constant integer.
+    Check(isa<ConstantInt>(Call.getArgOperand(3)),
+          "experimental.vector.match segment size needs to be an immediate "
+          "integer.",
+          &Call);
+
+    // The mask is an i1 vector with one lane per element of the search vector.
+    Check(SearchVecTy->getElementCount() == MaskVecTy->getElementCount(),
+          "experimental.vector.match mask must have the same number of "
+          "elements as the remaining vector operands.",
+          &Call);
+    Check(MaskVecTy->getElementType()->isIntegerTy(1),
+          "experimental.vector.match mask element type is not i1.", &Call);
+    break;
+  }
   case Intrinsic::vector_insert: {
     Value *Vec = Call.getArgOperand(0);
     Value *SubVec = Call.getArgOperand(1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6255,6 +6255,51 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
     return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
   }
+  case Intrinsic::experimental_vector_match: {
+    // Lower the generic match intrinsic onto the SVE2 MATCH intrinsic. Only
+    // 128-bit (known minimum) vectors of i8/i16 whose segment size equals the
+    // minimum element count are expected here; the asserts below enforce it.
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
+
+    auto Op1 = Op.getOperand(1);
+    auto Op2 = Op.getOperand(2);
+    auto Mask = Op.getOperand(3);
+    // SegmentSize and MinNumElts are only consulted by the assertions below;
+    // [[maybe_unused]] keeps release (NDEBUG) builds warning-free.
+    [[maybe_unused]] auto SegmentSize =
+        cast<ConstantSDNode>(Op.getOperand(4))->getLimitedValue();
+
+    EVT VT = Op.getValueType();
+    [[maybe_unused]] auto MinNumElts = VT.getVectorMinNumElements();
+
+    assert(Op1.getValueType() == Op2.getValueType() && "Type mismatch.");
+    assert(Op1.getValueSizeInBits().getKnownMinValue() == 128 &&
+           "Custom lower only works on 128-bit segments.");
+    assert((Op1.getValueType().getVectorElementType() == MVT::i8 ||
+            Op1.getValueType().getVectorElementType() == MVT::i16) &&
+           "Custom lower only supports 8-bit or 16-bit characters.");
+    assert(SegmentSize == MinNumElts && "Custom lower needs segment size to "
+                                        "match minimum number of elements.");
+
+    // Scalable operands map directly onto the SVE intrinsic; note that the
+    // mask is passed as its first operand.
+    if (VT.isScalableVector())
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Mask, Op1, Op2);
+
+    // We can use the SVE2 match instruction to lower this intrinsic by
+    // converting the operands to scalable vectors, doing a match, and then
+    // extracting a fixed-width subvector from the scalable vector.
+
+    EVT OpVT = Op1.getValueType();
+    EVT OpContainerVT = getContainerForFixedLengthVector(DAG, OpVT);
+    EVT MatchVT = OpContainerVT.changeElementType(MVT::i1);
+
+    auto ScalableOp1 = convertToScalableVector(DAG, OpContainerVT, Op1);
+    auto ScalableOp2 = convertToScalableVector(DAG, OpContainerVT, Op2);
+    // Widen the mask to the operand element width before converting it to a
+    // scalable predicate.
+    auto ScalableMask = DAG.getNode(ISD::SIGN_EXTEND, dl, OpVT, Mask);
+    ScalableMask = convertFixedMaskToScalableVector(ScalableMask, DAG);
+
+    SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MatchVT, ID,
+                                ScalableMask, ScalableOp1, ScalableOp2);
+
+    // NOTE(review): the value returned has type OpVT rather than VT; confirm
+    // the two agree for every type that reaches this point.
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+                       DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match),
+                       DAG.getVectorIdxConstant(0, dl));
+  }
   }
 }
 
@@ -27304,6 +27349,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
       return;
     }
+    case Intrinsic::experimental_vector_match:
     case Intrinsic::get_active_lane_mask: {
       if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
         return;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4041,6 +4041,18 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
   }
 }
 
+bool AArch64TTIImpl::hasVectorMatch(VectorType *VT, unsigned SegSize) const {
+  // MATCH requires SVE2 (with SVE usable) and is only lowered for 128-bit
+  // (known minimum) vectors of 8-bit or 16-bit integers, i.e. 16 x i8 or
+  // 8 x i16, with the segment size covering all of those elements. Reject
+  // non-integer element types (e.g. half) that have the same layout.
+  unsigned MinNumElts = VT->getElementCount().getKnownMinValue();
+  return ST->hasSVE2() && ST->isSVEAvailable() &&
+         VT->getElementType()->isIntegerTy() &&
+         VT->getPrimitiveSizeInBits().getKnownMinValue() == 128 &&
+         MinNumElts == SegSize && (MinNumElts == 8 || MinNumElts == 16);
+}
+
 InstructionCost
 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                        FastMathFlags FMF,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -392,6 +392,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->hasSVE();
   }
 
+  /// Returns true when SVE2 is present and SVE is available, `VT` is 128 bits
+  /// wide (known minimum) with 8 or 16 elements, and `SegSize` equals that
+  /// element count.
+  bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;
+
   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                              std::optional<FastMathFlags> FMF,
                                              TTI::TargetCostKind CostKind);
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll b/llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+; Scalable i8 vectors with a segment size of 16: expect a single MATCH.
+define <vscale x 16 x i1> @match_nxv16i8(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2, <vscale x 16 x i1> %mask, i32 16)
+  ret <vscale x 16 x i1> %r
+}
+
+; Scalable i16 vectors with a segment size of 8: expect a single MATCH.
+define <vscale x 8 x i1> @match_nxv8i16(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: match_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %op2, <vscale x 8 x i1> %mask, i32 8)
+  ret <vscale x 8 x i1> %r
+}
+
+; Fixed-length <16 x i8>: the mask is converted into a predicate, MATCH runs
+; on the Z-register views of the inputs, and the predicate result is expanded
+; back into a vector.
+define <16 x i1> @match_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
+; CHECK-LABEL: match_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v2.16b, v2.16b, #7
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    cmlt v2.16b, v2.16b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z2.b, #0
+; CHECK-NEXT:    match p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask, i32 16)
+  ret <16 x i1> %r
+}
+
+; Fixed-length <8 x i16>: as for <16 x i8>, except that the mask is first
+; widened to 16-bit lanes (ushll) and the result is narrowed back to 8-bit
+; lanes (xtn).
+define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
+; CHECK-LABEL: match_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v2.8h, v2.8h, #15
+; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z2.h, #0
+; CHECK-NEXT:    match p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask, i32 8)
+  ret <8 x i1> %r
+}
+
+attributes #0 = { "target-features"="+sve2" }

Original file line number	Diff line number	Diff line change
`@@ -972,6 +972,8 @@ class TargetTransformInfoImplBase {`
`972`	`972`	`return false;`
`973`	`973`	`}`
`974`	`974`
	`975`	`+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const { return false; }`
	`976`	`+`
`975`	`977`	`TargetTransformInfo::VPLegalization`
`976`	`978`	`getVPLegalizationStrategy(const VPIntrinsic &PI) const {`
`977`	`979`	`return TargetTransformInfo::VPLegalization(`
Original file line number	Diff line number	Diff line change
`@@ -392,6 +392,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {`
`392`	`392`	`return ST->hasSVE();`
`393`	`393`	`}`
`394`	`394`
	`395`	`+ bool hasVectorMatch(VectorType *VT, unsigned SegSize) const;`
	`396`	`+`
`395`	`397`	`InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,`
`396`	`398`	`std::optional<FastMathFlags> FMF,`
`397`	`399`	`TTI::TargetCostKind CostKind);`