Skip to content

Commit 7f46c67

Browse files
committed
[Intrinsics][AArch64] Add intrinsic to mask off aliasing vector lanes
It can be unsafe to load a vector from an address and write a vector to an address if those two addresses have overlapping lanes within a vectorised loop iteration. This PR adds an intrinsic designed to create a mask with lanes disabled if they overlap between the two pointer arguments, so that only safe lanes are loaded, operated on and stored. Along with the two pointer parameters, the intrinsic also takes an immediate that represents the size in bytes of the vector element types, as well as an immediate i1 that is true if there is a write-after-read hazard or false if there is a read-after-write hazard. This will be used by llvm#100579 and replaces the existing lowering for whilewr, since that isn't needed now that we have the intrinsic.
1 parent 4cc152c commit 7f46c67

File tree

10 files changed

+852
-14
lines changed

10 files changed

+852
-14
lines changed

llvm/docs/LangRef.rst

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23910,6 +23910,90 @@ Examples:
2391023910
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %elem0, i64 429)
2391123911
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
2391223912

23913+
.. _int_experimental_get_alias_lane_mask:
23914+
23915+
'``llvm.experimental.get.alias.lane.mask.*``' Intrinsics
23916+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
23917+
23918+
Syntax:
23919+
"""""""
23920+
This is an overloaded intrinsic.
23921+
23922+
::
23923+
23924+
declare <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 immarg %elementSize, i1 immarg %writeAfterRead)
23925+
declare <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 immarg %elementSize, i1 immarg %writeAfterRead)
23926+
declare <16 x i1> @llvm.experimental.get.alias.lane.mask.v16i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
23927+
declare <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
23928+
23929+
23930+
Overview:
23931+
"""""""""
23932+
23933+
Create a mask representing lanes that do or do not overlap between two pointers
23934+
across one vector loop iteration.
23935+
23936+
23937+
Arguments:
23938+
""""""""""
23939+
23940+
The first two arguments have the same scalar integer type.
23941+
The final two are immediates and the result is a vector with the i1 element type.
23942+
23943+
Semantics:
23944+
""""""""""
23945+
23946+
The intrinsic will return poison if ``%ptrA`` and ``%ptrB`` are within
23947+
VF * ``%elementSize`` of each other and ``%ptrA`` + VF * ``%elementSize`` wraps.
23948+
In other cases when ``%writeAfterRead`` is true, the
23949+
'``llvm.experimental.get.alias.lane.mask.*``' intrinsics are semantically
23950+
equivalent to:
23951+
23952+
::
23953+
23954+
%diff = (%ptrB - %ptrA) / %elementSize
23955+
%m[i] = (icmp ult i, %diff) || (%diff <= 0)
23956+
23957+
When the return value is not poison and ``%writeAfterRead`` is false, the
23958+
'``llvm.experimental.get.alias.lane.mask.*``' intrinsics are semantically
23959+
equivalent to:
23960+
23961+
::
23962+
23963+
%diff = abs(%ptrB - %ptrA) / %elementSize
23964+
%m[i] = (icmp ult i, %diff) || (%diff == 0)
23965+
23966+
where ``%m`` is a vector (mask) of active/inactive lanes with its elements
23967+
indexed by ``i``, and ``%ptrA``, ``%ptrB`` are the two i64 arguments to
23968+
``llvm.experimental.get.alias.lane.mask.*`` and ``%elementSize`` is the first
23969+
immediate argument. The ``%writeAfterRead`` argument is expected to be true if
23970+
``%ptrB`` is stored to after ``%ptrA`` is read from.
23971+
The above is equivalent to:
23972+
23973+
::
23974+
23975+
%m = @llvm.experimental.get.alias.lane.mask(%ptrA, %ptrB, %elementSize, %writeAfterRead)
23976+
23977+
This can, for example, be emitted by the loop vectorizer in which case
23978+
``%ptrA`` is a pointer that is read from within the loop, and ``%ptrB`` is a
23979+
pointer that is stored to within the loop.
23980+
If the difference between these pointers, in elements, is less than the vector factor, then
23981+
they overlap (alias) within a loop iteration.
23982+
For example, if ``%ptrA`` is 20 and ``%ptrB`` is 23 with a vector factor of 8 and 1-byte elements,
23983+
then lanes 3, 4, 5, 6 and 7 of the vector loaded from ``%ptrA``
23984+
share addresses with lanes 0, 1, 2, 3 and 4 from the vector stored to at
23985+
``%ptrB``.
23986+
23987+
23988+
Examples:
23989+
"""""""""
23990+
23991+
.. code-block:: llvm
23992+
23993+
%alias.lane.mask = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 4, i1 1)
23994+
%vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptrA, i32 4, <4 x i1> %alias.lane.mask, <4 x i32> poison)
23995+
[...]
23996+
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, <4 x i32>* %ptrB, i32 4, <4 x i1> %alias.lane.mask)
2391323997

2391423998
.. _int_experimental_vp_splice:
2391523999

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,13 @@ class TargetLoweringBase {
480480
return true;
481481
}
482482

483+
/// Return true if the @llvm.experimental.get.alias.lane.mask intrinsic should
484+
/// be expanded using generic code in SelectionDAGBuilder.
/// \p VT is the intrinsic's mask result type, \p PtrVT is the type of its two
/// pointer operands and \p EltSize is the value of its element-size
/// immediate. This default implementation always asks for the generic
/// expansion; targets override it to opt in to custom lowering.
485+
virtual bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
486+
unsigned EltSize) const {
487+
return true;
488+
}
489+
483490
virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
484491
bool IsScalable) const {
485492
return true;

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2391,6 +2391,11 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>
23912391
llvm_i32_ty]>;
23922392
}
23932393

2394+
// llvm.experimental.get.alias.lane.mask: returns an overloaded vector type.
// Operands 0 and 1 share one overloaded integer type (operand 1 is tied to
// operand 0 via LLVMMatchType<1>), operand 2 is a separately overloaded
// integer and operand 3 is an i1. Operands 2 and 3 must be immediates.
def int_experimental_get_alias_lane_mask:
2395+
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
2396+
[llvm_anyint_ty, LLVMMatchType<1>, llvm_anyint_ty, llvm_i1_ty],
2397+
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
2398+
23942399
def int_get_active_lane_mask:
23952400
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
23962401
[llvm_anyint_ty, LLVMMatchType<1>],

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8237,6 +8237,56 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
82378237
visitVectorExtractLastActive(I, Intrinsic);
82388238
return;
82398239
}
8240+
case Intrinsic::experimental_get_alias_lane_mask: {
8241+
SDValue SourceValue = getValue(I.getOperand(0));
8242+
SDValue SinkValue = getValue(I.getOperand(1));
8243+
SDValue EltSize = getValue(I.getOperand(2));
8244+
bool IsWriteAfterRead =
8245+
cast<ConstantSDNode>(getValue(I.getOperand(3)))->getZExtValue() != 0;
8246+
auto IntrinsicVT = EVT::getEVT(I.getType());
8247+
auto PtrVT = SourceValue->getValueType(0);
8248+
8249+
if (!TLI.shouldExpandGetAliasLaneMask(
8250+
IntrinsicVT, PtrVT,
8251+
cast<ConstantSDNode>(EltSize)->getSExtValue())) {
8252+
visitTargetIntrinsic(I, Intrinsic);
8253+
return;
8254+
}
8255+
8256+
SDValue Diff = DAG.getNode(ISD::SUB, sdl, PtrVT, SinkValue, SourceValue);
8257+
if (!IsWriteAfterRead)
8258+
Diff = DAG.getNode(ISD::ABS, sdl, PtrVT, Diff);
8259+
8260+
Diff = DAG.getNode(ISD::SDIV, sdl, PtrVT, Diff, EltSize);
8261+
SDValue Zero = DAG.getTargetConstant(0, sdl, PtrVT);
8262+
8263+
// If the difference is positive then some elements may alias
8264+
auto CmpVT =
8265+
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), PtrVT);
8266+
SDValue Cmp = DAG.getSetCC(sdl, CmpVT, Diff, Zero,
8267+
IsWriteAfterRead ? ISD::SETLE : ISD::SETEQ);
8268+
8269+
// Splat the compare result then OR it with a lane mask
8270+
SDValue Splat = DAG.getSplat(IntrinsicVT, sdl, Cmp);
8271+
8272+
SDValue DiffMask;
8273+
// Don't emit an active lane mask if the target doesn't support it
8274+
if (TLI.shouldExpandGetActiveLaneMask(IntrinsicVT, PtrVT)) {
8275+
EVT VecTy = EVT::getVectorVT(*DAG.getContext(), PtrVT,
8276+
IntrinsicVT.getVectorElementCount());
8277+
SDValue DiffSplat = DAG.getSplat(VecTy, sdl, Diff);
8278+
SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
8279+
DiffMask = DAG.getSetCC(sdl, IntrinsicVT, VectorStep, DiffSplat,
8280+
ISD::CondCode::SETULT);
8281+
} else {
8282+
DiffMask = DAG.getNode(
8283+
ISD::INTRINSIC_WO_CHAIN, sdl, IntrinsicVT,
8284+
DAG.getTargetConstant(Intrinsic::get_active_lane_mask, sdl, MVT::i64),
8285+
Zero, Diff);
8286+
}
8287+
SDValue Or = DAG.getNode(ISD::OR, sdl, IntrinsicVT, DiffMask, Splat);
8288+
setValue(&I, Or);
8289+
}
82408290
}
82418291
}
82428292

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2109,6 +2109,25 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
21092109
return false;
21102110
}
21112111

2112+
// Decide whether llvm.experimental.get.alias.lane.mask should take the generic
// SelectionDAGBuilder expansion (return true) or be lowered by this target
// (return false). Target lowering is only requested when SVE2 is available,
// the pointer operands are i64, and the element size matches the number of
// lanes in the mask type as listed below.
bool AArch64TargetLowering::shouldExpandGetAliasLaneMask(
2113+
EVT VT, EVT PtrVT, unsigned EltSize) const {
2114+
// Without SVE2, fall back to the generic expansion.
if (!Subtarget->hasSVE2())
2115+
return true;
2116+
2117+
// Only i64 pointer operands are handled by the target lowering.
if (PtrVT != MVT::i64)
2118+
return true;
2119+
2120+
// Each supported mask type (fixed or scalable) is paired with exactly one
// element size: 2 lanes <-> 8, 4 lanes <-> 4, 8 lanes <-> 2, 16 lanes <-> 1.
// Any other element size for that mask type is expanded generically.
if (VT == MVT::v2i1 || VT == MVT::nxv2i1)
2121+
return EltSize != 8;
2122+
if (VT == MVT::v4i1 || VT == MVT::nxv4i1)
2123+
return EltSize != 4;
2124+
if (VT == MVT::v8i1 || VT == MVT::nxv8i1)
2125+
return EltSize != 2;
2126+
if (VT == MVT::v16i1 || VT == MVT::nxv16i1)
2127+
return EltSize != 1;
2128+
// Any other mask type is expanded generically.
return true;
2129+
}
2130+
21122131
bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
21132132
const IntrinsicInst *I) const {
21142133
assert(I->getIntrinsicID() ==
@@ -2862,6 +2881,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
28622881
MAKE_CASE(AArch64ISD::LS64_BUILD)
28632882
MAKE_CASE(AArch64ISD::LS64_EXTRACT)
28642883
MAKE_CASE(AArch64ISD::TBL)
2884+
MAKE_CASE(AArch64ISD::WHILEWR)
2885+
MAKE_CASE(AArch64ISD::WHILERW)
28652886
MAKE_CASE(AArch64ISD::FADD_PRED)
28662887
MAKE_CASE(AArch64ISD::FADDA_PRED)
28672888
MAKE_CASE(AArch64ISD::FADDV_PRED)
@@ -6071,6 +6092,18 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
60716092
EVT PtrVT = getPointerTy(DAG.getDataLayout());
60726093
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
60736094
}
6095+
case Intrinsic::aarch64_sve_whilewr_b:
6096+
case Intrinsic::aarch64_sve_whilewr_h:
6097+
case Intrinsic::aarch64_sve_whilewr_s:
6098+
case Intrinsic::aarch64_sve_whilewr_d:
6099+
return DAG.getNode(AArch64ISD::WHILEWR, dl, Op.getValueType(),
6100+
Op.getOperand(1), Op.getOperand(2));
6101+
case Intrinsic::aarch64_sve_whilerw_b:
6102+
case Intrinsic::aarch64_sve_whilerw_h:
6103+
case Intrinsic::aarch64_sve_whilerw_s:
6104+
case Intrinsic::aarch64_sve_whilerw_d:
6105+
return DAG.getNode(AArch64ISD::WHILERW, dl, Op.getValueType(),
6106+
Op.getOperand(1), Op.getOperand(2));
60746107
case Intrinsic::aarch64_neon_abs: {
60756108
EVT Ty = Op.getValueType();
60766109
if (Ty == MVT::i64) {
@@ -6530,18 +6563,45 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
65306563
return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
65316564
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
65326565
}
6566+
case Intrinsic::experimental_get_alias_lane_mask:
65336567
case Intrinsic::get_active_lane_mask: {
6534-
SDValue ID =
6535-
DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6568+
unsigned IntrinsicID = Intrinsic::aarch64_sve_whilelo;
6569+
if (IntNo == Intrinsic::experimental_get_alias_lane_mask) {
6570+
uint64_t EltSize = Op.getOperand(3)->getAsZExtVal();
6571+
bool IsWriteAfterRead = Op.getOperand(4)->getAsZExtVal() == 1;
6572+
switch (EltSize) {
6573+
case 1:
6574+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b
6575+
: Intrinsic::aarch64_sve_whilerw_b;
6576+
break;
6577+
case 2:
6578+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h
6579+
: Intrinsic::aarch64_sve_whilerw_h;
6580+
break;
6581+
case 4:
6582+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s
6583+
: Intrinsic::aarch64_sve_whilerw_s;
6584+
break;
6585+
case 8:
6586+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d
6587+
: Intrinsic::aarch64_sve_whilerw_d;
6588+
break;
6589+
default:
6590+
llvm_unreachable("Unexpected element size for get.alias.lane.mask");
6591+
break;
6592+
}
6593+
}
6594+
SDValue ID = DAG.getTargetConstant(IntrinsicID, dl, MVT::i64);
65366595

65376596
EVT VT = Op.getValueType();
65386597
if (VT.isScalableVector())
65396598
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
65406599
Op.getOperand(2));
65416600

6542-
// We can use the SVE whilelo instruction to lower this intrinsic by
6543-
// creating the appropriate sequence of scalable vector operations and
6544-
// then extracting a fixed-width subvector from the scalable vector.
6601+
// We can use the SVE whilelo/whilewr/whilerw instruction to lower this
6602+
// intrinsic by creating the appropriate sequence of scalable vector
6603+
// operations and then extracting a fixed-width subvector from the scalable
6604+
// vector.
65456605

65466606
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
65476607
EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
@@ -19677,7 +19737,10 @@ static bool isPredicateCCSettingOp(SDValue N) {
1967719737
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
1967819738
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
1967919739
// get_active_lane_mask is lowered to a whilelo instruction.
19680-
N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19740+
N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask ||
19741+
// get_alias_lane_mask is lowered to a whilewr/rw instruction.
19742+
N.getConstantOperandVal(0) ==
19743+
Intrinsic::experimental_get_alias_lane_mask)))
1968119744
return true;
1968219745

1968319746
return false;
@@ -27716,6 +27779,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
2771627779
return;
2771727780
}
2771827781
case Intrinsic::experimental_vector_match:
27782+
case Intrinsic::experimental_get_alias_lane_mask:
2771927783
case Intrinsic::get_active_lane_mask: {
2772027784
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
2772127785
return;

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,10 @@ enum NodeType : unsigned {
281281
SMAXV,
282282
UMAXV,
283283

284+
// Alias lane masks
285+
WHILEWR,
286+
WHILERW,
287+
284288
SADDV_PRED,
285289
UADDV_PRED,
286290
SMAXV_PRED,
@@ -994,6 +998,9 @@ class AArch64TargetLowering : public TargetLowering {
994998

995999
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
9961000

1001+
bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
1002+
unsigned EltSize) const override;
1003+
9971004
bool
9981005
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
9991006

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
140140
// AArch64 SVE/SVE2 - the remaining node definitions
141141
//
142142

143+
// Alias masks
// SDT_AArch64Mask: one result and two operands. The result is a vector with
// i1 elements; the first operand is an integer and the second operand has the
// same type as the first. Both alias-mask nodes below share this profile.
144+
def SDT_AArch64Mask : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0,i1>]>;
145+
def AArch64whilewr : SDNode<"AArch64ISD::WHILEWR", SDT_AArch64Mask>;
146+
def AArch64whilerw : SDNode<"AArch64ISD::WHILERW", SDT_AArch64Mask>;
147+
143148
// SVE CNT/INC/RDVL
144149
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
145150
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
@@ -4000,9 +4005,9 @@ let Predicates = [HasSVE2_or_SME] in {
40004005
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, int_aarch64_sve_whilelo>;
40014006

40024007
// SVE2 pointer conflict compare
4003-
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
4004-
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
4005-
} // End HasSVE2_or_SME
4008+
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", AArch64whilewr>;
4009+
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", AArch64whilerw>;
4010+
} // End HasSVE2_or_SME
40064011

40074012
let Predicates = [HasSVEAES, HasNonStreamingSVE2_or_SSVE_AES] in {
40084013
// SVE2 crypto destructive binary operations

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5919,16 +5919,16 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
59195919
let isWhile = 1;
59205920
}
59215921

5922-
multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
5922+
multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> {
59235923
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
59245924
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
59255925
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
59265926
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
59275927

5928-
def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
5929-
def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
5930-
def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
5931-
def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
5928+
def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>;
5929+
def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>;
5930+
def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>;
5931+
def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>;
59325932
}
59335933

59345934
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)