[msan] Generalize handleVectorReduceIntrinsic to support Arm NEON add reduction to scalar #125288
Conversation
… reduction to scalar

This generalizes handleVectorReduceIntrinsic to allow intrinsics where the return type is not the same type as the fields. This is then used to support the Arm NEON add reduction to scalar intrinsics (llvm.aarch64.neon.faddv, llvm.aarch64.neon.saddv, llvm.aarch64.neon.uaddv).

Updates the tests from llvm#125271
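For intuition, here is a minimal standalone model of the shadow rule the generalized handler applies: OR all lane shadows of the operand together, then widen the combined shadow to the scalar return type. This is an illustrative sketch only, not MSan code; the function and variable names are hypothetical, and the actual instrumentation change is in the diff below.

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Model of the shadow of, e.g., i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a),
// given the per-lane i8 shadow of %a. A shadow of 0 means "fully initialized";
// any set bit marks poisoned bits.
static uint32_t reductionShadow(const std::array<uint8_t, 8> &laneShadow) {
  uint8_t s = 0;
  for (uint8_t lane : laneShadow)
    s |= lane;                       // OR-reduce the lane shadows (i8)
  return static_cast<uint32_t>(s);   // widen to the i32 return type
}

int main() {
  std::array<uint8_t, 8> clean{};   // every lane initialized
  std::array<uint8_t, 8> dirty{};
  dirty[3] = 0xFF;                  // one fully poisoned lane

  std::printf("clean -> %#x\n", static_cast<unsigned>(reductionShadow(clean))); // 0
  std::printf("dirty -> %#x\n", static_cast<unsigned>(reductionShadow(dirty))); // 0xff
  return 0;
}
```

The cast at the end corresponds to the CreateShadowCast call added in the patch; without it, the i8 result of the OR-reduction could not be used as the i32 return shadow.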
@llvm/pr-subscribers-compiler-rt-sanitizer @llvm/pr-subscribers-llvm-transforms

Author: Thurston Dang (thurstond)

Changes

This generalizes handleVectorReduceIntrinsic to allow intrinsics where the return type is not the same type as the fields. This is then used to support the Arm NEON add reduction to scalar intrinsics (llvm.aarch64.neon.faddv, llvm.aarch64.neon.saddv, llvm.aarch64.neon.uaddv). Updates the tests from #125271

Patch is 42.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125288.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index a4f7e43f041c38..19f7a3001c9926 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3493,11 +3493,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Instrument generic vector reduction intrinsics
// by ORing together all their fields.
+ //
+ // The return type does not need to be the same type as the fields
+ // e.g., declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
void handleVectorReduceIntrinsic(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
+ S = CreateShadowCast(IRB, S, getShadowTy(&I));
setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
+ setOriginForNaryOp(I);
}
// Instrument vector.reduce.or intrinsic.
@@ -4342,6 +4346,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_xor:
case Intrinsic::vector_reduce_mul:
+ // Add reduction to scalar
+ case Intrinsic::aarch64_neon_faddv:
+ case Intrinsic::aarch64_neon_saddv:
+ case Intrinsic::aarch64_neon_uaddv:
handleVectorReduceIntrinsic(I);
break;
case Intrinsic::x86_sse_stmxcsr:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll
index f0e607db9d281e..5dd808dd0aa7f3 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll
@@ -17,16 +17,12 @@ define signext i8 @test_vaddv_s8(<8 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[TMP4]]
;
entry:
@@ -42,16 +38,12 @@ define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[A1]], i8 [[TMP6]], i32 3
; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i8> [[TMP7]]
@@ -69,16 +61,12 @@ define signext i16 @test_vaddv_s16(<4 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[TMP4]]
;
entry:
@@ -94,16 +82,12 @@ define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[A1]], i16 [[TMP6]], i32 3
; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i16> [[TMP7]]
@@ -121,15 +105,9 @@ define i32 @test_vaddv_s32(<2 x i32> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[VADDV_I]]
;
; 2 x i32 is not supported by the ISA, thus, this is a special case
@@ -145,15 +123,9 @@ define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP1]], i32 0, i32 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[A1]], i32 [[TMP5]], i32 1
; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i32> [[TMP6]]
@@ -170,15 +142,9 @@ define i64 @test_vaddv_s64(<2 x i64> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[VADDV_I]]
;
entry:
@@ -193,15 +159,9 @@ define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[A1]], i64 [[TMP5]], i64 1
; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[TMP6]]
@@ -218,16 +178,12 @@ define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[TMP4]]
;
entry:
@@ -243,16 +199,12 @@ define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[A1]], i8 [[TMP6]], i32 3
; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i8> [[TMP7]]
@@ -270,17 +222,14 @@ define i32 @test_vaddv_u8_masked(<8 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A1]])
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[VADDV_I]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i32 0, [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP2]], 511
+; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[VADDV_I]], 511
; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP7]]
@@ -297,16 +246,12 @@ define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[TMP4]]
;
entry:
@@ -322,16 +267,12 @@ define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[A1]], i16 [[TMP6]], i32 3
; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i16> [[TMP7]]
@@ -349,17 +290,14 @@ define i32 @test_vaddv_u16_masked(<4 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A1]])
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[VADDV_I]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i32 0, [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP2]], 3276799
+; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[VADDV_I]], 3276799
; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP7]]
@@ -376,15 +314,9 @@ define i32 @test_vaddv_u32(<2 x i32> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[VADDV_I]]
;
; 2 x i32 is not supported by the ISA, thus, this is a special case
@@ -400,15 +332,9 @@ define <2 x i32> @tes...
[truncated]
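For reference, the construction the patch performs can be reproduced outside the pass with LLVM's IRBuilder: reduce the operand shadow with llvm.vector.reduce.or, then cast the result to the intrinsic's return type. The sketch below is a standalone illustration, not the MSan code; the function name and the <16 x i8> -> i32 shapes are chosen for the uaddv example, and it assumes an LLVM development setup to build against.

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("reduce_shadow_demo", Ctx);
  IRBuilder<> B(Ctx);

  // Hypothetical helper: maps the <16 x i8> shadow of the operand of
  // i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>) to its i32 return shadow.
  auto *VecTy = FixedVectorType::get(B.getInt8Ty(), 16);
  auto *FnTy = FunctionType::get(B.getInt32Ty(), {VecTy}, /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage,
                                 "uaddv_shadow_i32_v16i8", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  B.SetInsertPoint(BB);

  Value *OperandShadow = F->getArg(0);
  // OR all shadow lanes together (emits a call to llvm.vector.reduce.or.v16i8).
  Value *S = B.CreateOrReduce(OperandShadow);   // i8
  // Widen the reduced shadow to the i32 return type, mirroring the shadow
  // cast the patch adds for integer shadows.
  S = B.CreateZExtOrTrunc(S, B.getInt32Ty());   // i32
  B.CreateRet(S);

  verifyModule(M, &errs());
  M.print(outs(), nullptr);
  return 0;
}
```

The emitted body matches the pattern visible in the updated CHECK lines above: a call to llvm.vector.reduce.or followed by a zext to the return width.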
This uses the generalized handleVectorReduceIntrinsic (from llvm#125288) to handle NEON saturating extract and narrow (Intrinsic::aarch64_neon_{sqxtn, sqxtun, uqxtn}). Updates the tests from llvm#125140 and llvm#125288
… reduction to scalar (llvm#125288)

This generalizes handleVectorReduceIntrinsic to allow intrinsics where the return type is not the same type as the fields. This patch then applies the generalized handleVectorReduceIntrinsic to support the following Arm NEON add reduction to scalar intrinsics: llvm.aarch64.neon.{faddv, saddv, uaddv}.

Updates the tests from llvm#125271
…742)

This handles NEON saturating extract and narrow (Intrinsic::aarch64_neon_{sqxtn, sqxtun, uqxtn}) by (ab)using handleShadowOr() to perform the shadow cast. Previously, these were unknown intrinsics handled suboptimally by visitInstruction.

Updates the tests from llvm/llvm-project#125288 and llvm/llvm-project#125140
This handles NEON saturating extract and narrow (Intrinsic::aarch64_neon_{sqxtn, sqxtun, uqxtn}) by (ab)using handleShadowOr() to perform the shadow cast. Previously, these were unknown intrinsics handled suboptimally by visitInstruction. Updates the tests from llvm#125288 and llvm#125140
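A rough standalone model of that follow-up approach, hedged: it assumes the OR-ed operand shadow is cast lane-wise when the input and output vectors have the same number of elements, and the helper name below is made up for illustration.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Model of the return shadow of a narrowing such as
// <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>), given the per-lane
// i32 shadow of the operand. With a single operand, OR-ing the operand
// shadows leaves just that shadow, so only the lane-wise cast remains.
// (Whether shadow bits above bit 15 survive the cast is outside this sketch.)
static std::array<uint16_t, 4>
narrowShadow(const std::array<uint32_t, 4> &inShadow) {
  std::array<uint16_t, 4> out{};
  for (std::size_t i = 0; i < out.size(); ++i)
    out[i] = static_cast<uint16_t>(inShadow[i]);  // lane-wise i32 -> i16 cast
  return out;
}

int main() {
  std::array<uint32_t, 4> shadow{0, 0xFFFFFFFFu, 0, 0};  // lane 1 poisoned
  for (uint16_t s : narrowShadow(shadow))
    std::printf("%#x ", static_cast<unsigned>(s));       // 0 0xffff 0 0
  std::printf("\n");
  return 0;
}
```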
This generalizes handleVectorReduceIntrinsic to allow intrinsics where the return type is not the same type as the fields. This patch then applies the generalized handleVectorReduceIntrinsic to support the following Arm NEON add reduction to scalar intrinsics: llvm.aarch64.neon.{faddv, saddv, uaddv}.
Updates the tests from #125271