Commit 0930461

[msan] Generalize handlePairwiseShadowOrIntrinsic, and handle x86 pairwise add/sub
x86 pairwise add and sub are currently handled by applying the pairwise add intrinsic to the shadow (llvm#124835), because there is no x86 pairwise OR intrinsic. handlePairwiseShadowOrIntrinsic was added (llvm#126008) to handle Arm pairwise add, but it assumes that the intrinsic operates on each pair of elements as defined by the LLVM type. In contrast, x86 pairwise add/sub may take, for example, <1 x i64> parameters while conceptually operating on <2 x i32> elements. This patch generalizes handlePairwiseShadowOrIntrinsic to optionally reinterpret the parameters as vectors with a specified element width, and then uses it to handle x86 pairwise add/sub.
1 parent ef9f0b3 · commit 0930461
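To illustrate the idea on one concrete case, here is a rough sketch (not taken from the patch's tests; the value names %s0, %s1, %e0, %e1, %evn, %odd, %or, %res are hypothetical) of the shadow IR this approach produces for @llvm.x86.ssse3.phadd.w, whose two <1 x i64> operands conceptually hold four i16 lanes each: the operand shadows are reinterpreted as <4 x i16>, the even and odd lanes are gathered across both operands, OR'd pairwise, and cast back to the call's shadow type.

  %e0  = bitcast <1 x i64> %s0 to <4 x i16>   ; shadow of operand 0, reinterpreted at width 16
  %e1  = bitcast <1 x i64> %s1 to <4 x i16>   ; shadow of operand 1, reinterpreted at width 16
  %evn = shufflevector <4 x i16> %e0, <4 x i16> %e1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <4 x i16> %e0, <4 x i16> %e1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %or  = or <4 x i16> %evn, %odd              ; OR of each adjacent pair of shadow lanes
  %res = bitcast <4 x i16> %or to <1 x i64>   ; shadow assigned to the phadd.w result

The 32-bit variants (e.g. phadd.d) follow the same pattern with ReinterpretElemWidth == 32, and the SSE3/AVX floating-point hadd/hsub intrinsics need no reinterpretation at all.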

File tree

7 files changed: +226 / -161 lines

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 82 additions & 105 deletions
@@ -2608,38 +2608,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
   ///       <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
   ///
-  /// TODO: adapt this function to handle horizontal add/sub?
-  void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I) {
+  /// Optionally, reinterpret the parameters to have elements of a specified
+  /// width. For example:
+  ///   @llvm.x86.ssse3.phadd.w(<1 x i64> [[VAR1]], <1 x i64> [[VAR2]])
+  /// conceptually operates on
+  ///   (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
+  /// and can be handled with ReinterpretElemWidth == 16.
+  void
+  handlePairwiseShadowOrIntrinsic(IntrinsicInst &I,
+                                  std::optional<int> ReinterpretElemWidth) {
     assert(I.arg_size() == 1 || I.arg_size() == 2);
 
     assert(I.getType()->isVectorTy());
     assert(I.getArgOperand(0)->getType()->isVectorTy());
 
     FixedVectorType *ParamType =
         cast<FixedVectorType>(I.getArgOperand(0)->getType());
-    if (I.arg_size() == 2)
+    if (I.arg_size() == 2) {
+      assert(I.getArgOperand(1)->getType()->isVectorTy());
       assert(ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType()));
+    }
+
     [[maybe_unused]] FixedVectorType *ReturnType =
         cast<FixedVectorType>(I.getType());
     assert(ParamType->getNumElements() * I.arg_size() ==
            2 * ReturnType->getNumElements());
 
     IRBuilder<> IRB(&I);
-    unsigned Width = ParamType->getNumElements() * I.arg_size();
+
+    unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size();
+    FixedVectorType *ReinterpretShadowTy = nullptr;
+    if (ReinterpretElemWidth.has_value()) {
+      assert(ParamType->getPrimitiveSizeInBits() %
+                 ReinterpretElemWidth.value() ==
+             0);
+      ReinterpretShadowTy = FixedVectorType::get(
+          IRB.getIntNTy(ReinterpretElemWidth.value()),
+          ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth.value());
+      TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size();
+    }
 
     // Horizontal OR of shadow
     SmallVector<int, 8> EvenMask;
     SmallVector<int, 8> OddMask;
-    for (unsigned X = 0; X < Width; X += 2) {
+    for (unsigned X = 0; X + 1 < TotalNumElems; X += 2) {
       EvenMask.push_back(X);
       OddMask.push_back(X + 1);
     }
 
     Value *FirstArgShadow = getShadow(&I, 0);
+    if (ReinterpretShadowTy)
+      FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy);
+
+    // If we had two parameters each with an odd number of elements, the total
+    // number of elements is even, but we have never seen this in extant
+    // instruction sets, so we enforce that each parameter must have an even
+    // number of elements.
+    assert(
+        (cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements()) %
+            2 ==
+        0);
+
     Value *EvenShadow;
     Value *OddShadow;
     if (I.arg_size() == 2) {
       Value *SecondArgShadow = getShadow(&I, 1);
+      if (ReinterpretShadowTy)
+        SecondArgShadow =
+            IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
+      assert((cast<FixedVectorType>(SecondArgShadow->getType())
+                  ->getNumElements()) %
+                 2 ==
+             0);
+
       EvenShadow =
           IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
       OddShadow =
@@ -2653,6 +2694,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
 
     setShadow(&I, OrShadow);
+
     setOriginForNaryOp(I);
   }
 
@@ -4156,87 +4198,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  void handleAVXHorizontalAddSubIntrinsic(IntrinsicInst &I) {
-    // Approximation only:
-    //   output = horizontal_add/sub(A, B)
-    //   => shadow[output] = horizontal_add(shadow[A], shadow[B])
-    //
-    // We always use horizontal add instead of subtract, because subtracting
-    // a fully uninitialized shadow would result in a fully initialized shadow.
-    //
-    // - If we add two adjacent zero (initialized) shadow values, the
-    //   result always be zero i.e., no false positives.
-    // - If we add two shadows, one of which is uninitialized, the
-    //   result will always be non-zero i.e., no false negatives.
-    // - However, we can have false negatives if we do an addition that wraps
-    //   to zero; we consider this an acceptable tradeoff for performance.
-    //
-    // To make shadow propagation precise, we want the equivalent of
-    // "horizontal OR", but this is not available for SSE3/SSSE3/AVX/AVX2.
-
-    Intrinsic::ID shadowIntrinsicID = I.getIntrinsicID();
-
-    switch (I.getIntrinsicID()) {
-    case Intrinsic::x86_sse3_hsub_ps:
-      shadowIntrinsicID = Intrinsic::x86_sse3_hadd_ps;
-      break;
-
-    case Intrinsic::x86_sse3_hsub_pd:
-      shadowIntrinsicID = Intrinsic::x86_sse3_hadd_pd;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_d:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_d;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_d_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_d_128;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_w:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_w;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_w_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_w_128;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_sw:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_sw;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_sw_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_sw_128;
-      break;
-
-    case Intrinsic::x86_avx_hsub_pd_256:
-      shadowIntrinsicID = Intrinsic::x86_avx_hadd_pd_256;
-      break;
-
-    case Intrinsic::x86_avx_hsub_ps_256:
-      shadowIntrinsicID = Intrinsic::x86_avx_hadd_ps_256;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_d:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_d;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_w:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_w;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_sw:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_sw;
-      break;
-
-    default:
-      break;
-    }
-
-    return handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID,
-                                             /*trailingVerbatimArgs*/ 0);
-  }
-
   /// Handle Arm NEON vector store intrinsics (vst{2,3,4}, vst1x_{2,3,4},
   /// and vst{2,3,4}lane).
   ///
@@ -4783,33 +4744,49 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVtestIntrinsic(I);
      break;
 
-    case Intrinsic::x86_sse3_hadd_ps:
-    case Intrinsic::x86_sse3_hadd_pd:
-    case Intrinsic::x86_ssse3_phadd_d:
-    case Intrinsic::x86_ssse3_phadd_d_128:
+    // Packed Horizontal Add/Subtract
     case Intrinsic::x86_ssse3_phadd_w:
     case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_ssse3_phsub_w:
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_avx2_phsub_w: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+      break;
+    }
+
+    // Packed Horizontal Add/Subtract
+    case Intrinsic::x86_ssse3_phadd_d:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_d:
+    case Intrinsic::x86_ssse3_phsub_d:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_d: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/32);
+      break;
+    }
+
+    // Packed Horizontal Add/Subtract and Saturate
     case Intrinsic::x86_ssse3_phadd_sw:
     case Intrinsic::x86_ssse3_phadd_sw_128:
+    case Intrinsic::x86_avx2_phadd_sw:
+    case Intrinsic::x86_ssse3_phsub_sw:
+    case Intrinsic::x86_ssse3_phsub_sw_128:
+    case Intrinsic::x86_avx2_phsub_sw: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+      break;
+    }
+
+    // Packed Single/Double Precision Floating-Point Horizontal Add
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
     case Intrinsic::x86_avx_hadd_pd_256:
     case Intrinsic::x86_avx_hadd_ps_256:
-    case Intrinsic::x86_avx2_phadd_d:
-    case Intrinsic::x86_avx2_phadd_w:
-    case Intrinsic::x86_avx2_phadd_sw:
     case Intrinsic::x86_sse3_hsub_ps:
     case Intrinsic::x86_sse3_hsub_pd:
-    case Intrinsic::x86_ssse3_phsub_d:
-    case Intrinsic::x86_ssse3_phsub_d_128:
-    case Intrinsic::x86_ssse3_phsub_w:
-    case Intrinsic::x86_ssse3_phsub_w_128:
-    case Intrinsic::x86_ssse3_phsub_sw:
-    case Intrinsic::x86_ssse3_phsub_sw_128:
     case Intrinsic::x86_avx_hsub_pd_256:
-    case Intrinsic::x86_avx_hsub_ps_256:
-    case Intrinsic::x86_avx2_phsub_d:
-    case Intrinsic::x86_avx2_phsub_w:
-    case Intrinsic::x86_avx2_phsub_sw: {
-      handleAVXHorizontalAddSubIntrinsic(I);
+    case Intrinsic::x86_avx_hsub_ps_256: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/std::nullopt);
       break;
     }
 
@@ -4869,7 +4846,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Add Long Pairwise
     case Intrinsic::aarch64_neon_saddlp:
     case Intrinsic::aarch64_neon_uaddlp: {
-      handlePairwiseShadowOrIntrinsic(I);
+      handlePairwiseShadowOrIntrinsic(I, std::nullopt);
       break;
     }
 
llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll

Lines changed: 12 additions & 16 deletions
@@ -435,10 +435,9 @@ define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
-; CHECK-NEXT:    [[A1:%.*]] = bitcast <4 x i64> [[TMP2]] to <4 x double>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0]], <4 x double> [[A1]])
-; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A2:%.*]], <4 x double> [[A3:%.*]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[RES1]]
@@ -454,10 +453,9 @@ define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) #
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
-; CHECK-NEXT:    [[A1:%.*]] = bitcast <8 x i32> [[TMP2]] to <8 x float>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
-; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A2:%.*]], <8 x float> [[A3:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[RES1]]
@@ -473,10 +471,9 @@ define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
-; CHECK-NEXT:    [[A1:%.*]] = bitcast <4 x i64> [[TMP2]] to <4 x double>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0]], <4 x double> [[A1]])
-; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[A2:%.*]], <4 x double> [[A3:%.*]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[RES1]]
@@ -492,10 +489,9 @@ define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) #
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
-; CHECK-NEXT:    [[A1:%.*]] = bitcast <8 x i32> [[TMP2]] to <8 x float>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
-; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[A2:%.*]], <8 x float> [[A3:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[RES1]]
