@@ -2607,8 +2607,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
  ///
  /// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
  ///       <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
-  ///
-  /// TODO: adapt this function to handle horizontal add/sub?
  void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I) {
    assert(I.arg_size() == 1 || I.arg_size() == 2);

@@ -2617,8 +2615,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {

    FixedVectorType *ParamType =
        cast<FixedVectorType>(I.getArgOperand(0)->getType());
-    if (I.arg_size() == 2)
-      assert(ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType()));
+    assert((I.arg_size() != 2) ||
+           (ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType())));

    [[maybe_unused]] FixedVectorType *ReturnType =
        cast<FixedVectorType>(I.getType());
    assert(ParamType->getNumElements() * I.arg_size() ==
@@ -2656,6 +2654,82 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
    setOriginForNaryOp(I);
  }

+  /// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent
+  /// fields, with the parameters reinterpreted to have elements of a specified
+  /// width. For example:
+  ///   @llvm.x86.ssse3.phadd.w(<1 x i64> [[VAR1]], <1 x i64> [[VAR2]])
+  /// conceptually operates on
+  ///   (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
+  /// and can be handled with ReinterpretElemWidth == 16.
+  void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I,
+                                       int ReinterpretElemWidth) {
+    assert(I.arg_size() == 1 || I.arg_size() == 2);
+
+    assert(I.getType()->isVectorTy());
+    assert(I.getArgOperand(0)->getType()->isVectorTy());
+
+    FixedVectorType *ParamType =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType());
+    assert((I.arg_size() != 2) ||
+           (ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType())));
+
+    [[maybe_unused]] FixedVectorType *ReturnType =
+        cast<FixedVectorType>(I.getType());
+    assert(ParamType->getNumElements() * I.arg_size() ==
+           2 * ReturnType->getNumElements());
+
+    IRBuilder<> IRB(&I);
+
+    unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size();
+    FixedVectorType *ReinterpretShadowTy = nullptr;
+    assert(isAligned(Align(ReinterpretElemWidth),
+                     ParamType->getPrimitiveSizeInBits()));
+    ReinterpretShadowTy = FixedVectorType::get(
+        IRB.getIntNTy(ReinterpretElemWidth),
+        ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth);
+    TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size();
+
+    // Horizontal OR of shadow
+    SmallVector<int, 8> EvenMask;
+    SmallVector<int, 8> OddMask;
+    for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
+      EvenMask.push_back(X);
+      OddMask.push_back(X + 1);
+    }
+
+    Value *FirstArgShadow = getShadow(&I, 0);
+    FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy);
+
+    // If we had two parameters each with an odd number of elements, the total
+    // number of elements is even, but we have never seen this in extant
+    // instruction sets, so we enforce that each parameter must have an even
+    // number of elements.
+    assert(isAligned(
+        Align(2),
+        cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements()));
+
+    Value *EvenShadow;
+    Value *OddShadow;
+    if (I.arg_size() == 2) {
+      Value *SecondArgShadow = getShadow(&I, 1);
+      SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
+
+      EvenShadow =
+          IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
+      OddShadow =
+          IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
+    } else {
+      EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
+      OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
+    }
+
+    Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+    OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
+
+    setShadow(&I, OrShadow);
+    setOriginForNaryOp(I);
+  }
+
  void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }

  // Handle multiplication by constant.
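The shuffle-and-OR sequence added above can be hard to picture from the IRBuilder calls alone. Below is a minimal, self-contained C++ sketch of the same computation (the helper name and shadow values are hypothetical, chosen only for illustration, not part of the patch): lane i of the output shadow is the OR of lanes 2i and 2i+1 of the concatenated, width-reinterpreted operand shadows, so a result lane is reported clean only when both inputs it combines are clean.

// Illustrative model only: pairwise shadow OR for a 2-operand horizontal op
// handled with ReinterpretElemWidth == 16 (e.g. @llvm.x86.ssse3.phadd.w.128).
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint16_t> pairwiseShadowOr(const std::vector<uint16_t> &ShadowA,
                                              const std::vector<uint16_t> &ShadowB) {
  // Concatenate the two operand shadows, as the EvenMask/OddMask shuffles do.
  std::vector<uint16_t> Concat(ShadowA);
  Concat.insert(Concat.end(), ShadowB.begin(), ShadowB.end());
  std::vector<uint16_t> Out;
  for (size_t I = 0; I + 1 < Concat.size(); I += 2)
    Out.push_back(Concat[I] | Concat[I + 1]); // even lane OR odd lane
  return Out;
}

int main() {
  // 0x0000 = fully initialized lane, 0xFFFF = fully uninitialized lane.
  std::vector<uint16_t> ShadowA = {0x0000, 0x0000, 0xFFFF, 0x0000};
  std::vector<uint16_t> ShadowB = {0x0000, 0x0000, 0x0000, 0x0000};
  for (uint16_t S : pairwiseShadowOr(ShadowA, ShadowB))
    std::printf("%04x ", static_cast<unsigned>(S)); // prints: 0000 ffff 0000 0000
  std::printf("\n");
  return 0;
}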
@@ -4156,87 +4230,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
    setOriginForNaryOp(I);
  }

-  void handleAVXHorizontalAddSubIntrinsic(IntrinsicInst &I) {
-    // Approximation only:
-    //   output = horizontal_add/sub(A, B)
-    //   => shadow[output] = horizontal_add(shadow[A], shadow[B])
-    //
-    // We always use horizontal add instead of subtract, because subtracting
-    // a fully uninitialized shadow would result in a fully initialized shadow.
-    //
-    // - If we add two adjacent zero (initialized) shadow values, the
-    //   result will always be zero i.e., no false positives.
-    // - If we add two shadows, one of which is uninitialized, the
-    //   result will always be non-zero i.e., no false negatives.
-    // - However, we can have false negatives if we do an addition that wraps
-    //   to zero; we consider this an acceptable tradeoff for performance.
-    //
-    // To make shadow propagation precise, we want the equivalent of
-    // "horizontal OR", but this is not available for SSE3/SSSE3/AVX/AVX2.
-
-    Intrinsic::ID shadowIntrinsicID = I.getIntrinsicID();
-
-    switch (I.getIntrinsicID()) {
-    case Intrinsic::x86_sse3_hsub_ps:
-      shadowIntrinsicID = Intrinsic::x86_sse3_hadd_ps;
-      break;
-
-    case Intrinsic::x86_sse3_hsub_pd:
-      shadowIntrinsicID = Intrinsic::x86_sse3_hadd_pd;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_d:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_d;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_d_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_d_128;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_w:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_w;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_w_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_w_128;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_sw:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_sw;
-      break;
-
-    case Intrinsic::x86_ssse3_phsub_sw_128:
-      shadowIntrinsicID = Intrinsic::x86_ssse3_phadd_sw_128;
-      break;
-
-    case Intrinsic::x86_avx_hsub_pd_256:
-      shadowIntrinsicID = Intrinsic::x86_avx_hadd_pd_256;
-      break;
-
-    case Intrinsic::x86_avx_hsub_ps_256:
-      shadowIntrinsicID = Intrinsic::x86_avx_hadd_ps_256;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_d:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_d;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_w:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_w;
-      break;
-
-    case Intrinsic::x86_avx2_phsub_sw:
-      shadowIntrinsicID = Intrinsic::x86_avx2_phadd_sw;
-      break;
-
-    default:
-      break;
-    }
-
-    return handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID,
-                                             /*trailingVerbatimArgs*/ 0);
-  }
-
  /// Handle Arm NEON vector store intrinsics (vst{2,3,4}, vst1x_{2,3,4},
  /// and vst{2,3,4}lane).
  ///
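The comment in the removed handleAVXHorizontalAddSubIntrinsic above notes that the hadd-of-shadows approximation can produce a false negative when the addition wraps to zero, which is exactly the case the pairwise shadow OR avoids. A small C++ sketch of that corner case (values are hypothetical, for illustration only):

#include <cstdint>
#include <cstdio>

int main() {
  // Two adjacent 16-bit shadow lanes, each with only the top bit uninitialized.
  uint16_t ShadowEven = 0x8000;
  uint16_t ShadowOdd = 0x8000;

  // Old approximation: horizontal add of shadows wraps to 0, so the result
  // looks fully initialized (a false negative).
  uint16_t ByAdd = static_cast<uint16_t>(ShadowEven + ShadowOdd);

  // New approach: OR of the adjacent shadow lanes keeps the uninitialized bit.
  uint16_t ByOr = static_cast<uint16_t>(ShadowEven | ShadowOdd);

  std::printf("add: %04x  or: %04x\n", static_cast<unsigned>(ByAdd),
              static_cast<unsigned>(ByOr)); // prints: add: 0000  or: 8000
  return 0;
}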
@@ -4783,33 +4776,49 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
      handleVtestIntrinsic(I);
      break;

-    case Intrinsic::x86_sse3_hadd_ps:
-    case Intrinsic::x86_sse3_hadd_pd:
-    case Intrinsic::x86_ssse3_phadd_d:
-    case Intrinsic::x86_ssse3_phadd_d_128:
+    // Packed Horizontal Add/Subtract
    case Intrinsic::x86_ssse3_phadd_w:
    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_ssse3_phsub_w:
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_avx2_phsub_w: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+      break;
+    }
+
+    // Packed Horizontal Add/Subtract
+    case Intrinsic::x86_ssse3_phadd_d:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_d:
+    case Intrinsic::x86_ssse3_phsub_d:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_d: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/32);
+      break;
+    }
+
+    // Packed Horizontal Add/Subtract and Saturate
    case Intrinsic::x86_ssse3_phadd_sw:
    case Intrinsic::x86_ssse3_phadd_sw_128:
+    case Intrinsic::x86_avx2_phadd_sw:
+    case Intrinsic::x86_ssse3_phsub_sw:
+    case Intrinsic::x86_ssse3_phsub_sw_128:
+    case Intrinsic::x86_avx2_phsub_sw: {
+      handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+      break;
+    }
+
+    // Packed Single/Double Precision Floating-Point Horizontal Add
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
    case Intrinsic::x86_avx_hadd_pd_256:
    case Intrinsic::x86_avx_hadd_ps_256:
-    case Intrinsic::x86_avx2_phadd_d:
-    case Intrinsic::x86_avx2_phadd_w:
-    case Intrinsic::x86_avx2_phadd_sw:
    case Intrinsic::x86_sse3_hsub_ps:
    case Intrinsic::x86_sse3_hsub_pd:
-    case Intrinsic::x86_ssse3_phsub_d:
-    case Intrinsic::x86_ssse3_phsub_d_128:
-    case Intrinsic::x86_ssse3_phsub_w:
-    case Intrinsic::x86_ssse3_phsub_w_128:
-    case Intrinsic::x86_ssse3_phsub_sw:
-    case Intrinsic::x86_ssse3_phsub_sw_128:
    case Intrinsic::x86_avx_hsub_pd_256:
-    case Intrinsic::x86_avx_hsub_ps_256:
-    case Intrinsic::x86_avx2_phsub_d:
-    case Intrinsic::x86_avx2_phsub_w:
-    case Intrinsic::x86_avx2_phsub_sw: {
-      handleAVXHorizontalAddSubIntrinsic(I);
+    case Intrinsic::x86_avx_hsub_ps_256: {
+      handlePairwiseShadowOrIntrinsic(I);
      break;
    }
