
Commit 5d404d7

[msan] Generalize handlePairwiseShadowOrIntrinsic, and handle x86 pairwise add/sub (#127567)
x86 pairwise add and sub are currently handled by applying the pairwise add intrinsic to the shadow (#124835), due to the lack of an x86 pairwise OR intrinsic. handlePairwiseShadowOrIntrinsic was added (#126008) to handle Arm pairwise add, but it assumes that the intrinsic operates on each pair of elements as defined by the LLVM type. In contrast, x86 pairwise add/sub may take, for example, <1 x i64> as a parameter type while actually operating on <2 x i32>. This patch generalizes handlePairwiseShadowOrIntrinsic to allow reinterpreting the parameters as a vector with a specified element width, and then uses this function to handle x86 pairwise add/sub.
1 parent 5f6a3e6 commit 5d404d7
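
To illustrate the idea, here is a minimal sketch (not taken from the patch; value names are hypothetical) of the shadow IR this approach aims to produce for @llvm.x86.ssse3.phadd.w, whose <1 x i64> shadows are reinterpreted as <4 x i16> before the even and odd lanes are OR'ed together:

  ; %s0 and %s1 are the <1 x i64> shadows of the two operands.
  %a = bitcast <1 x i64> %s0 to <4 x i16>
  %b = bitcast <1 x i64> %s1 to <4 x i16>
  ; Gather the even-indexed and odd-indexed lanes of the concatenated shadows.
  %even = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; A result lane is uninitialized iff either element of its input pair is.
  %or = or <4 x i16> %even, %odd
  ; Cast back to the shadow type of the <1 x i64> result.
  %ret = bitcast <4 x i16> %or to <1 x i64>

Unlike the previous approach of applying the corresponding horizontal add intrinsic to the shadows, OR'ing the even and odd lanes cannot lose uninitialized bits when an addition wraps to zero.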

File tree: 7 files changed (+256, -159 lines)

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 112 additions & 103 deletions
@@ -2607,8 +2607,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   ///
   /// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
   ///       <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
-  ///
-  /// TODO: adapt this function to handle horizontal add/sub?
   void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I) {
     assert(I.arg_size() == 1 || I.arg_size() == 2);

@@ -2617,8 +2615,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {

     FixedVectorType *ParamType =
         cast<FixedVectorType>(I.getArgOperand(0)->getType());
-    if (I.arg_size() == 2)
-      assert(ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType()));
+    assert((I.arg_size() != 2) ||
+           (ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType())));
     [[maybe_unused]] FixedVectorType *ReturnType =
         cast<FixedVectorType>(I.getType());
     assert(ParamType->getNumElements() * I.arg_size() ==
@@ -2656,6 +2654,82 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }

+  /// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent
+  /// fields, with the parameters reinterpreted to have elements of a specified
+  /// width. For example:
+  ///     @llvm.x86.ssse3.phadd.w(<1 x i64> [[VAR1]], <1 x i64> [[VAR2]])
+  /// conceptually operates on
+  ///     (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
+  /// and can be handled with ReinterpretElemWidth == 16.
+  void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I,
+                                       int ReinterpretElemWidth) {
+    assert(I.arg_size() == 1 || I.arg_size() == 2);
+
+    assert(I.getType()->isVectorTy());
+    assert(I.getArgOperand(0)->getType()->isVectorTy());
+
+    FixedVectorType *ParamType =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType());
+    assert((I.arg_size() != 2) ||
+           (ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType())));
+
+    [[maybe_unused]] FixedVectorType *ReturnType =
+        cast<FixedVectorType>(I.getType());
+    assert(ParamType->getNumElements() * I.arg_size() ==
+           2 * ReturnType->getNumElements());
+
+    IRBuilder<> IRB(&I);
+
+    unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size();
+    FixedVectorType *ReinterpretShadowTy = nullptr;
+    assert(isAligned(Align(ReinterpretElemWidth),
+                     ParamType->getPrimitiveSizeInBits()));
+    ReinterpretShadowTy = FixedVectorType::get(
+        IRB.getIntNTy(ReinterpretElemWidth),
+        ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth);
+    TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size();
+
+    // Horizontal OR of shadow
+    SmallVector<int, 8> EvenMask;
+    SmallVector<int, 8> OddMask;
+    for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
+      EvenMask.push_back(X);
+      OddMask.push_back(X + 1);
+    }
+
+    Value *FirstArgShadow = getShadow(&I, 0);
+    FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy);
+
+    // If we had two parameters each with an odd number of elements, the total
+    // number of elements is even, but we have never seen this in extant
+    // instruction sets, so we enforce that each parameter must have an even
+    // number of elements.
+    assert(isAligned(
+        Align(2),
+        cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements()));
+
+    Value *EvenShadow;
+    Value *OddShadow;
+    if (I.arg_size() == 2) {
+      Value *SecondArgShadow = getShadow(&I, 1);
+      SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
+
+      EvenShadow =
+          IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
+      OddShadow =
+          IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
+    } else {
+      EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
+      OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
+    }
+
+    Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+    OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
+
+    setShadow(&I, OrShadow);
+    setOriginForNaryOp(I);
+  }
+
   void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }

   // Handle multiplication by constant.
@@ -4156,87 +4230,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }

-  void handleAVXHorizontalAddSubIntrinsic(IntrinsicInst &I) {
-    // Approximation only:
-    //   output            = horizontal_add/sub(A, B)
-    //   => shadow[output] = horizontal_add(shadow[A], shadow[B])
-    //
-    // We always use horizontal add instead of subtract, because subtracting
-    // a fully uninitialized shadow would result in a fully initialized shadow.
-    //
-    // - If we add two adjacent zero (initialized) shadow values, the
-    //   result always be zero i.e., no false positives.
-    // - If we add two shadows, one of which is uninitialized, the
-    //   result will always be non-zero i.e., no false negatives.
-    // - However, we can have false negatives if we do an addition that wraps
-    //   to zero; we consider this an acceptable tradeoff for performance.
-    //
-    // To make shadow propagation precise, we want the equivalent of
-    // "horizontal OR", but this is not available for SSE3/SSSE3/AVX/AVX2.
-
-    Intrinsic::ID shadowIntrinsicID = I.getIntrinsicID();
-
-    switch (I.getIntrinsicID()) {
-    case Intrinsic::x86_sse3_hsub_ps:
-      shadowIntrinsicID = Intrinsic::x86_sse3_hadd_ps;
-      break;
-
-    case Intrinsic::x86_sse3_hsub_pd:
-      shadowIntrinsicID = Intrinsic::x86_sse3_hadd_pd;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_d:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_d;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_d_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_d_128;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_w:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_w;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_w_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_w_128;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_sw:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_sw;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_sw_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_sw_128;
-      break;
-
-    case Intrinsic::x86_avx_hsub_pd_256:
-      shadowIntrinsicID = Intrinsic::x86_avx_hadd_pd_256;
-      break;
-
-    case Intrinsic::x86_avx_hsub_ps_256:
-      shadowIntrinsicID = Intrinsic::x86_avx_hadd_ps_256;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_d:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_d;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_w:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_w;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_sw:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_sw;
-      break;
-
-    default:
-      break;
-    }
-
-    return handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID,
-                                             /*trailingVerbatimArgs*/ 0);
-  }
-
   /// Handle Arm NEON vector store intrinsics (vst{2,3,4}, vst1x_{2,3,4},
   /// and vst{2,3,4}lane).
   ///
@@ -4783,33 +4776,49 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVtestIntrinsic(I);
       break;

-    case Intrinsic::x86_sse3_hadd_ps:
-    case Intrinsic::x86_sse3_hadd_pd:
-    case Intrinsic::x86_ssse3_phadd_d:
-    case Intrinsic::x86_ssse3_phadd_d_128:
+    // Packed Horizontal Add/Subtract
     case Intrinsic::x86_ssse3_phadd_w:
     case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_ssse3_phsub_w:
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_avx2_phsub_w: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+      break;
+    }
+
+    // Packed Horizontal Add/Subtract
+    case Intrinsic::x86_ssse3_phadd_d:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_d:
+    case Intrinsic::x86_ssse3_phsub_d:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_d: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/32);
+      break;
+    }
+
+    // Packed Horizontal Add/Subtract and Saturate
     case Intrinsic::x86_ssse3_phadd_sw:
     case Intrinsic::x86_ssse3_phadd_sw_128:
+    case Intrinsic::x86_avx2_phadd_sw:
+    case Intrinsic::x86_ssse3_phsub_sw:
+    case Intrinsic::x86_ssse3_phsub_sw_128:
+    case Intrinsic::x86_avx2_phsub_sw: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+      break;
+    }
+
+    // Packed Single/Double Precision Floating-Point Horizontal Add
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
     case Intrinsic::x86_avx_hadd_pd_256:
     case Intrinsic::x86_avx_hadd_ps_256:
-    case Intrinsic::x86_avx2_phadd_d:
-    case Intrinsic::x86_avx2_phadd_w:
-    case Intrinsic::x86_avx2_phadd_sw:
     case Intrinsic::x86_sse3_hsub_ps:
     case Intrinsic::x86_sse3_hsub_pd:
-    case Intrinsic::x86_ssse3_phsub_d:
-    case Intrinsic::x86_ssse3_phsub_d_128:
-    case Intrinsic::x86_ssse3_phsub_w:
-    case Intrinsic::x86_ssse3_phsub_w_128:
-    case Intrinsic::x86_ssse3_phsub_sw:
-    case Intrinsic::x86_ssse3_phsub_sw_128:
     case Intrinsic::x86_avx_hsub_pd_256:
-    case Intrinsic::x86_avx_hsub_ps_256:
-    case Intrinsic::x86_avx2_phsub_d:
-    case Intrinsic::x86_avx2_phsub_w:
-    case Intrinsic::x86_avx2_phsub_sw: {
-      handleAVXHorizontalAddSubIntrinsic(I);
+    case Intrinsic::x86_avx_hsub_ps_256: {
+      handlePairwiseShadowOrIntrinsic(I);
       break;
     }

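A note on the design choice documented in the removed comment above: applying a horizontal add intrinsic to the shadows can wrap to zero and hide uninitialized bits, whereas OR'ing the even and odd shadow lanes cannot. A tiny illustrative IR example (hypothetical values, where only the top bit of each 16-bit element is uninitialized, i.e. each shadow is 0x8000):

  %wrapped = add i16 -32768, -32768 ; shadow sum wraps to 0: pair looks initialized (false negative)
  %precise = or i16 -32768, -32768  ; OR keeps 0x8000: the uninitialized bit is preserved

The tests below show the new OR-based shadow propagation emitted for the AVX horizontal add/sub intrinsics.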
llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll

Lines changed: 12 additions & 16 deletions
@@ -435,10 +435,9 @@ define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
-; CHECK-NEXT: [[A1:%.*]] = bitcast <4 x i64> [[TMP2]] to <4 x double>
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0]], <4 x double> [[A1]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A2:%.*]], <4 x double> [[A3:%.*]])
 ; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x double> [[RES1]]
@@ -454,10 +453,9 @@ define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) #
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
-; CHECK-NEXT: [[A1:%.*]] = bitcast <8 x i32> [[TMP2]] to <8 x float>
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A2:%.*]], <8 x float> [[A3:%.*]])
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x float> [[RES1]]
@@ -473,10 +471,9 @@ define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
-; CHECK-NEXT: [[A1:%.*]] = bitcast <4 x i64> [[TMP2]] to <4 x double>
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0]], <4 x double> [[A1]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[A2:%.*]], <4 x double> [[A3:%.*]])
 ; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x double> [[RES1]]
@@ -492,10 +489,9 @@ define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) #
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
-; CHECK-NEXT: [[A1:%.*]] = bitcast <8 x i32> [[TMP2]] to <8 x float>
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[A2:%.*]], <8 x float> [[A3:%.*]])
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x float> [[RES1]]
