Skip to content

Commit d1e9741

Browse files
committed
[AMDGPU] Vectorize more 16 bit shuffles (llvm#90648)
For larger vectors, we should still prefer the vectorized version (i.e. shufflevector rather than extract/insert chains).

In arithmetic chains, vectorization results in chains of packed math instructions (as opposed to unpack/repack and scalarized arithmetic): https://godbolt.org/z/c5onaf6G5

In chains with PHIs, vectorization likewise removes the unnecessary pack/repack code around basic blocks: https://godbolt.org/z/vz7zYzvhs

Change-Id: I2da3af0c596f2e3273553642a2b27f97f10509e7
1 parent eba0f55 commit d1e9741

File tree

7 files changed

+1277
-716
lines changed

7 files changed

+1277
-716
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,22 +1128,54 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11281128
TTI::TargetCostKind CostKind,
11291129
int Index, VectorType *SubTp,
11301130
ArrayRef<const Value *> Args) {
1131+
if (!isa<FixedVectorType>(VT))
1132+
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1133+
11311134
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
11321135

1133-
if (ST->hasVOP3PInsts()) {
1134-
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1135-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1136+
// Larger vector widths may require additional instructions, but are
1137+
// typically cheaper than scalarized versions.
1138+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1139+
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1140+
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1141+
bool HasVOP3P = ST->hasVOP3PInsts();
1142+
unsigned RequestedElts =
1143+
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1144+
if (RequestedElts == 0)
1145+
return 0;
1146+
switch (Kind) {
1147+
case TTI::SK_Broadcast:
1148+
case TTI::SK_Reverse:
1149+
case TTI::SK_PermuteSingleSrc: {
11361150
// With op_sel VOP3P instructions freely can access the low half or high
1137-
// half of a register, so any swizzle is free.
1138-
1139-
switch (Kind) {
1140-
case TTI::SK_Broadcast:
1141-
case TTI::SK_Reverse:
1142-
case TTI::SK_PermuteSingleSrc:
1151+
// half of a register, so any swizzle of two elements is free.
1152+
if (HasVOP3P && NumVectorElts == 2)
11431153
return 0;
1144-
default:
1145-
break;
1146-
}
1154+
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1155+
// SK_Broadcast just reuses the same mask
1156+
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1157+
return NumPerms + NumPermMasks;
1158+
}
1159+
case TTI::SK_ExtractSubvector:
1160+
case TTI::SK_InsertSubvector: {
1161+
// Even aligned accesses are free
1162+
if (!(Index % 2))
1163+
return 0;
1164+
// Insert/extract subvectors only require shifts / extract code to get the
1165+
// relevant bits
1166+
return alignTo(RequestedElts, 2) / 2;
1167+
}
1168+
case TTI::SK_PermuteTwoSrc:
1169+
case TTI::SK_Splice:
1170+
case TTI::SK_Select: {
1171+
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1172+
// SK_Select just reuses the same mask
1173+
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1174+
return NumPerms + NumPermMasks;
1175+
}
1176+
1177+
default:
1178+
break;
11471179
}
11481180
}
11491181

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Lines changed: 1079 additions & 542 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
33
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
77
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
2121
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
2222
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
2323
;
24+
; GFX9-LABEL: @uadd_sat_v2i16(
25+
; GFX9-NEXT: bb:
26+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
27+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
28+
;
2429
bb:
2530
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
2631
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
5156
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
5257
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
5358
;
59+
; GFX9-LABEL: @usub_sat_v2i16(
60+
; GFX9-NEXT: bb:
61+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
62+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
63+
;
5464
bb:
5565
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
5666
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
8191
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
8292
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
8393
;
94+
; GFX9-LABEL: @sadd_sat_v2i16(
95+
; GFX9-NEXT: bb:
96+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
97+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
98+
;
8499
bb:
85100
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
86101
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
111126
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
112127
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
113128
;
129+
; GFX9-LABEL: @ssub_sat_v2i16(
130+
; GFX9-NEXT: bb:
131+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
132+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
133+
;
114134
bb:
115135
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
116136
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
252272
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
253273
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
254274
;
275+
; GFX9-LABEL: @uadd_sat_v3i16(
276+
; GFX9-NEXT: bb:
277+
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
278+
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
279+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
280+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
281+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
282+
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
283+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
284+
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
285+
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
286+
;
255287
bb:
256288
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
257289
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
291323
;
292324
; GFX8-LABEL: @uadd_sat_v4i16(
293325
; GFX8-NEXT: bb:
294-
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
295-
; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
296-
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
297-
; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
298-
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
299-
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
326+
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
327+
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
300328
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
301-
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
302-
; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
303-
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
304-
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
305-
; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
306-
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
329+
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
330+
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
331+
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
332+
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
333+
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
334+
;
335+
; GFX9-LABEL: @uadd_sat_v4i16(
336+
; GFX9-NEXT: bb:
337+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
338+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
339+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
340+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
341+
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
342+
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
343+
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
344+
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
307345
;
308346
bb:
309347
%arg0.0 = extractelement <4 x i16> %arg0, i64 0

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
33
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
77
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
2121
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
2222
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
2323
;
24+
; GFX9-LABEL: @uadd_sat_v2i16(
25+
; GFX9-NEXT: bb:
26+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
27+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
28+
;
2429
bb:
2530
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
2631
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
5156
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
5257
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
5358
;
59+
; GFX9-LABEL: @usub_sat_v2i16(
60+
; GFX9-NEXT: bb:
61+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
62+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
63+
;
5464
bb:
5565
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
5666
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
8191
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
8292
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
8393
;
94+
; GFX9-LABEL: @sadd_sat_v2i16(
95+
; GFX9-NEXT: bb:
96+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
97+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
98+
;
8499
bb:
85100
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
86101
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
111126
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
112127
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
113128
;
129+
; GFX9-LABEL: @ssub_sat_v2i16(
130+
; GFX9-NEXT: bb:
131+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
132+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
133+
;
114134
bb:
115135
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
116136
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
252272
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
253273
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
254274
;
275+
; GFX9-LABEL: @uadd_sat_v3i16(
276+
; GFX9-NEXT: bb:
277+
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
278+
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
279+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
280+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
281+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
282+
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
283+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
284+
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
285+
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
286+
;
255287
bb:
256288
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
257289
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
291323
;
292324
; GFX8-LABEL: @uadd_sat_v4i16(
293325
; GFX8-NEXT: bb:
294-
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
295-
; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
296-
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
297-
; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
298-
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
299-
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
326+
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
327+
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
300328
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
301-
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
302-
; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
303-
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
304-
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
305-
; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
306-
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
329+
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
330+
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
331+
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
332+
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
333+
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
334+
;
335+
; GFX9-LABEL: @uadd_sat_v4i16(
336+
; GFX9-NEXT: bb:
337+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
338+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
339+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
340+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
341+
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
342+
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
343+
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
344+
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
307345
;
308346
bb:
309347
%arg0.0 = extractelement <4 x i16> %arg0, i64 0

llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,10 @@
44
define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
55
; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
66
; CHECK-NEXT: bb:
7-
; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
8-
; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
9-
; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
10-
; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
11-
; CHECK-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
12-
; CHECK-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
13-
; CHECK-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
14-
; CHECK-NEXT: [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
15-
; CHECK-NEXT: ret <2 x i16> [[INS_2]]
7+
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> undef, <9 x i16> [[ARG0:%.*]], <2 x i32> <i32 0, i32 17>
8+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
9+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
10+
; CHECK-NEXT: ret <2 x i16> [[TMP2]]
1611
;
1712
bb:
1813
%arg0.1 = extractelement <9 x i16> undef, i64 7

0 commit comments

Comments
 (0)