
Commit 3c8473b

[SLP] allow matching integer min/max intrinsics as reduction ops
As noted in D98152, we need to patch SLP to avoid regressions when we start canonicalizing to integer min/max intrinsics. Most of the real work to make this possible was in: 7202f47

Differential Revision: https://reviews.llvm.org/D98981
1 parent 520f70e commit 3c8473b
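
To illustrate the kind of pattern this enables, here is a minimal sketch mirroring the updated tests below (the function and value names are placeholders, not from this commit): a chain of scalar llvm.smax calls over consecutively loaded elements can now be recognized as a horizontal reduction and, when the cost model finds it profitable, rewritten into a single vector load plus a llvm.vector.reduce.smax call.

declare i32 @llvm.smax.i32(i32, i32)

define i32 @smax_chain_rdx_v4i32(i32* %p) {
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  %p2 = getelementptr inbounds i32, i32* %p, i64 2
  %p3 = getelementptr inbounds i32, i32* %p, i64 3
  %t0 = load i32, i32* %p, align 4
  %t1 = load i32, i32* %p1, align 4
  %t2 = load i32, i32* %p2, align 4
  %t3 = load i32, i32* %p3, align 4
  ; sequential max chain over the four loaded values
  %m01 = call i32 @llvm.smax.i32(i32 %t1, i32 %t0)
  %m012 = call i32 @llvm.smax.i32(i32 %m01, i32 %t2)
  %m = call i32 @llvm.smax.i32(i32 %m012, i32 %t3)
  ret i32 %m
}

; When SLP considers this profitable, the body becomes roughly:
;   %v = load <4 x i32>, <4 x i32>* <bitcast of %p>, align 4
;   %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
;   ret i32 %r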

File tree (3 files changed, +77 −146 lines):
  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
  llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
  llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 21 additions & 10 deletions
@@ -6628,17 +6628,20 @@ class HorizontalReduction {
     if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
       return RecurKind::FMin;
 
+    // This matches either cmp+select or intrinsics. SLP is expected to handle
+    // either form.
+    // TODO: If we are canonicalizing to intrinsics, we can remove several
+    // special-case paths that deal with selects.
+    if (match(I, m_SMax(m_Value(), m_Value())))
+      return RecurKind::SMax;
+    if (match(I, m_SMin(m_Value(), m_Value())))
+      return RecurKind::SMin;
+    if (match(I, m_UMax(m_Value(), m_Value())))
+      return RecurKind::UMax;
+    if (match(I, m_UMin(m_Value(), m_Value())))
+      return RecurKind::UMin;
+
     if (auto *Select = dyn_cast<SelectInst>(I)) {
-      // These would also match llvm.{u,s}{min,max} intrinsic call
-      // if were not guarded by the SelectInst check above.
-      if (match(I, m_SMax(m_Value(), m_Value())))
-        return RecurKind::SMax;
-      if (match(I, m_SMin(m_Value(), m_Value())))
-        return RecurKind::SMin;
-      if (match(I, m_UMax(m_Value(), m_Value())))
-        return RecurKind::UMax;
-      if (match(I, m_UMin(m_Value(), m_Value())))
-        return RecurKind::UMin;
       // Try harder: look for min/max pattern based on instructions producing
       // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
       // During the intermediate stages of SLP, it's very common to have
@@ -7353,6 +7356,14 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
     return true;
   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
     return true;
+  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
+    return true;
   return false;
 }
 
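
The new comment in the first hunk refers to the two equivalent scalar forms of an integer min/max that the matcher accepts. As a minimal illustrative sketch (the value and function names are hypothetical), both of the following express a signed max and are recognized via m_SMax as RecurKind::SMax:

define i32 @smax_select_form(i32 %a, i32 %b) {
  ; cmp+select form of signed max
  %cmp = icmp sgt i32 %a, %b
  %max = select i1 %cmp, i32 %a, i32 %b
  ret i32 %max
}

define i32 @smax_intrinsic_form(i32 %a, i32 %b) {
  ; intrinsic form of signed max, now matched before the SelectInst-only path
  %max = call i32 @llvm.smax.i32(i32 %a, i32 %b)
  ret i32 %max
}

declare i32 @llvm.smax.i32(i32, i32)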

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Lines changed: 33 additions & 76 deletions
@@ -1016,22 +1016,10 @@ define i32 @smax_intrinsic_rdx_v8i32(i32* %p0) {
 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 5
 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 6
 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 7
-; CHECK-NEXT: [[T0:%.*]] = load i32, i32* [[P0]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load i32, i32* [[P1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[P2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[P3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load i32, i32* [[P4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[P5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[P6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load i32, i32* [[P7]], align 4
-; CHECK-NEXT: [[M10:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T1]], i32 [[T0]])
-; CHECK-NEXT: [[M32:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T3]], i32 [[T2]])
-; CHECK-NEXT: [[M54:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T5]], i32 [[T4]])
-; CHECK-NEXT: [[M76:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T7]], i32 [[T6]])
-; CHECK-NEXT: [[M3210:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M32]], i32 [[M10]])
-; CHECK-NEXT: [[M7654:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M76]], i32 [[M54]])
-; CHECK-NEXT: [[M:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M7654]], i32 [[M3210]])
-; CHECK-NEXT: ret i32 [[M]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <8 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2
@@ -1067,22 +1055,10 @@ define i16 @smin_intrinsic_rdx_v8i16(i16* %p0) {
 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; CHECK-NEXT: [[T0:%.*]] = load i16, i16* [[P0]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load i16, i16* [[P1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load i16, i16* [[P2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load i16, i16* [[P3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load i16, i16* [[P4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load i16, i16* [[P5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load i16, i16* [[P6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load i16, i16* [[P7]], align 4
-; CHECK-NEXT: [[M10:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T1]], i16 [[T0]])
-; CHECK-NEXT: [[M32:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T3]], i16 [[T2]])
-; CHECK-NEXT: [[M54:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T5]], i16 [[T4]])
-; CHECK-NEXT: [[M76:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T7]], i16 [[T6]])
-; CHECK-NEXT: [[M3210:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M32]], i16 [[M10]])
-; CHECK-NEXT: [[M7654:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M76]], i16 [[M54]])
-; CHECK-NEXT: [[M:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M7654]], i16 [[M3210]])
-; CHECK-NEXT: ret i16 [[M]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[TMP2]])
+; CHECK-NEXT: ret i16 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1110,18 +1086,27 @@ define i16 @smin_intrinsic_rdx_v8i16(i16* %p0) {
 }
 
 define i64 @umax_intrinsic_rdx_v4i64(i64* %p0) {
-; CHECK-LABEL: @umax_intrinsic_rdx_v4i64(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3
-; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[P0]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[P1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load i64, i64* [[P2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load i64, i64* [[P3]], align 4
-; CHECK-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; CHECK-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; CHECK-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; CHECK-NEXT: ret i64 [[M]]
+; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
+; DEFAULT-NEXT: [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], i64 1
+; DEFAULT-NEXT: [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2
+; DEFAULT-NEXT: [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3
+; DEFAULT-NEXT: [[T0:%.*]] = load i64, i64* [[P0]], align 4
+; DEFAULT-NEXT: [[T1:%.*]] = load i64, i64* [[P1]], align 4
+; DEFAULT-NEXT: [[T2:%.*]] = load i64, i64* [[P2]], align 4
+; DEFAULT-NEXT: [[T3:%.*]] = load i64, i64* [[P3]], align 4
+; DEFAULT-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
+; DEFAULT-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
+; DEFAULT-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
+; DEFAULT-NEXT: ret i64 [[M]]
+;
+; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
+; THRESH-NEXT: [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], i64 1
+; THRESH-NEXT: [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2
+; THRESH-NEXT: [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3
+; THRESH-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <4 x i64>*
+; THRESH-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP1]], align 4
+; THRESH-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP2]])
+; THRESH-NEXT: ret i64 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i64, i64* %p0, i64 1
   %p2 = getelementptr inbounds i64, i64* %p0, i64 2
@@ -1153,38 +1138,10 @@ define i8 @umin_intrinsic_rdx_v16i8(i8* %p0) {
 ; CHECK-NEXT: [[PD:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
 ; CHECK-NEXT: [[PE:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
 ; CHECK-NEXT: [[PF:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[P0]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[P1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[P2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[P3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[P4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[P5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[P6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[P7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load i8, i8* [[P8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load i8, i8* [[P9]], align 4
-; CHECK-NEXT: [[TA:%.*]] = load i8, i8* [[PA]], align 4
-; CHECK-NEXT: [[TB:%.*]] = load i8, i8* [[PB]], align 4
-; CHECK-NEXT: [[TC:%.*]] = load i8, i8* [[PC]], align 4
-; CHECK-NEXT: [[TD:%.*]] = load i8, i8* [[PD]], align 4
-; CHECK-NEXT: [[TE:%.*]] = load i8, i8* [[PE]], align 4
-; CHECK-NEXT: [[TF:%.*]] = load i8, i8* [[PF]], align 4
-; CHECK-NEXT: [[M10:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T1]], i8 [[T0]])
-; CHECK-NEXT: [[M32:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T3]], i8 [[T2]])
-; CHECK-NEXT: [[M54:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T5]], i8 [[T4]])
-; CHECK-NEXT: [[M76:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T7]], i8 [[T6]])
-; CHECK-NEXT: [[M98:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T9]], i8 [[T8]])
-; CHECK-NEXT: [[MBA:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TB]], i8 [[TA]])
-; CHECK-NEXT: [[MDC:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TD]], i8 [[TC]])
-; CHECK-NEXT: [[MFE:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TF]], i8 [[TE]])
-; CHECK-NEXT: [[M3210:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M32]], i8 [[M10]])
-; CHECK-NEXT: [[M7654:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M76]], i8 [[M54]])
-; CHECK-NEXT: [[MDC98:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MDC]], i8 [[M98]])
-; CHECK-NEXT: [[MFEBA:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MFE]], i8 [[MBA]])
-; CHECK-NEXT: [[ML:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M3210]], i8 [[M7654]])
-; CHECK-NEXT: [[MH:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MFEBA]], i8 [[MDC98]])
-; CHECK-NEXT: [[M:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MH]], i8 [[ML]])
-; CHECK-NEXT: ret i8 [[M]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[TMP2]])
+; CHECK-NEXT: ret i8 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2

llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

Lines changed: 23 additions & 60 deletions
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
 
@@ -21,15 +21,20 @@ define i32 @smax_v2i32(i32) {
 }
 
 define i32 @smax_v4i32(i32) {
-; CHECK-LABEL: @smax_v4i32(
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
-; CHECK-NEXT: ret i32 [[TMP8]]
+; SSE-LABEL: @smax_v4i32(
+; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SSE-NEXT: [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE-NEXT: [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
+; SSE-NEXT: [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
+; SSE-NEXT: ret i32 [[TMP8]]
+;
+; AVX-LABEL: @smax_v4i32(
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
+; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; AVX-NEXT: ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -43,22 +48,9 @@ define i32 @smax_v4i32(i32) {
 
 define i32 @smax_v8i32(i32) {
 ; CHECK-LABEL: @smax_v8i32(
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP10]], i32 [[TMP4]])
-; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP11]], i32 [[TMP5]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 [[TMP6]])
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP13]], i32 [[TMP7]])
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[TMP8]])
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP15]], i32 [[TMP9]])
-; CHECK-NEXT: ret i32 [[TMP16]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -80,38 +72,9 @@ define i32 @smax_v8i32(i32) {
 
 define i32 @smax_v16i32(i32) {
 ; CHECK-LABEL: @smax_v16i32(
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP18]], i32 [[TMP4]])
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP19]], i32 [[TMP5]])
-; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP20]], i32 [[TMP6]])
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP7]])
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP22]], i32 [[TMP8]])
-; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP23]], i32 [[TMP9]])
-; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP24]], i32 [[TMP10]])
-; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP25]], i32 [[TMP11]])
-; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP26]], i32 [[TMP12]])
-; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP27]], i32 [[TMP13]])
-; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP28]], i32 [[TMP14]])
-; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP29]], i32 [[TMP15]])
-; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP30]], i32 [[TMP16]])
-; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP31]], i32 [[TMP17]])
-; CHECK-NEXT: ret i32 [[TMP32]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
