Skip to content

Commit ea43a30

Browse files
authored
[AMDGPU] Vectorize more 16 bit shuffles (#90648)
In the case of larger vectors, we should still prefer the vectorized version (i.e. shufflevector vs extract/insert chains). In arithmetic chains, vectorization results in chains of packed math instructions (as opposed to unpack/repack & scalarized arithmetic): https://godbolt.org/z/c5onaf6G5 In chains with PHIs, vectorization again removes the unnecessary pack / repack code around BBs: https://godbolt.org/z/vz7zYzvhs
1 parent f52d29c commit ea43a30

File tree

7 files changed

+1278
-724
lines changed

7 files changed

+1278
-724
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,31 +1129,56 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11291129
int Index, VectorType *SubTp,
11301130
ArrayRef<const Value *> Args,
11311131
const Instruction *CxtI) {
1132+
if (!isa<FixedVectorType>(VT))
1133+
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1134+
11321135
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
1133-
// Treat extractsubvector as single op permutation.
1134-
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1135-
if (IsExtractSubvector)
1136-
Kind = TTI::SK_PermuteSingleSrc;
1137-
1138-
if (ST->hasVOP3PInsts()) {
1139-
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1140-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1141-
// With op_sel VOP3P instructions freely can access the low half or high
1142-
// half of a register, so any swizzle is free.
11431136

1144-
switch (Kind) {
1145-
case TTI::SK_Broadcast:
1146-
case TTI::SK_Reverse:
1147-
case TTI::SK_PermuteSingleSrc:
1137+
// Larger vector widths may require additional instructions, but are
1138+
// typically cheaper than scalarized versions.
1139+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1140+
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1141+
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1142+
bool HasVOP3P = ST->hasVOP3PInsts();
1143+
unsigned RequestedElts =
1144+
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1145+
if (RequestedElts == 0)
1146+
return 0;
1147+
switch (Kind) {
1148+
case TTI::SK_Broadcast:
1149+
case TTI::SK_Reverse:
1150+
case TTI::SK_PermuteSingleSrc: {
1151+
// With op_sel VOP3P instructions freely can access the low half or high
1152+
// half of a register, so any swizzle of two elements is free.
1153+
if (HasVOP3P && NumVectorElts == 2)
11481154
return 0;
1149-
default:
1150-
break;
1151-
}
1155+
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1156+
// SK_Broadcast just reuses the same mask
1157+
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1158+
return NumPerms + NumPermMasks;
1159+
}
1160+
case TTI::SK_ExtractSubvector:
1161+
case TTI::SK_InsertSubvector: {
1162+
// Even aligned accesses are free
1163+
if (!(Index % 2))
1164+
return 0;
1165+
// Insert/extract subvectors only require shifts / extract code to get the
1166+
// relevant bits
1167+
return alignTo(RequestedElts, 2) / 2;
1168+
}
1169+
case TTI::SK_PermuteTwoSrc:
1170+
case TTI::SK_Splice:
1171+
case TTI::SK_Select: {
1172+
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1173+
// SK_Select just reuses the same mask
1174+
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1175+
return NumPerms + NumPermMasks;
1176+
}
1177+
1178+
default:
1179+
break;
11521180
}
11531181
}
1154-
// Restore optimal kind.
1155-
if (IsExtractSubvector)
1156-
Kind = TTI::SK_ExtractSubvector;
11571182

11581183
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
11591184
}

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Lines changed: 1079 additions & 542 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
33
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
77
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
2121
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
2222
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
2323
;
24+
; GFX9-LABEL: @uadd_sat_v2i16(
25+
; GFX9-NEXT: bb:
26+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
27+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
28+
;
2429
bb:
2530
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
2631
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
5156
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
5257
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
5358
;
59+
; GFX9-LABEL: @usub_sat_v2i16(
60+
; GFX9-NEXT: bb:
61+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
62+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
63+
;
5464
bb:
5565
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
5666
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
8191
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
8292
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
8393
;
94+
; GFX9-LABEL: @sadd_sat_v2i16(
95+
; GFX9-NEXT: bb:
96+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
97+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
98+
;
8499
bb:
85100
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
86101
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
111126
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
112127
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
113128
;
129+
; GFX9-LABEL: @ssub_sat_v2i16(
130+
; GFX9-NEXT: bb:
131+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
132+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
133+
;
114134
bb:
115135
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
116136
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
252272
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
253273
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
254274
;
275+
; GFX9-LABEL: @uadd_sat_v3i16(
276+
; GFX9-NEXT: bb:
277+
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
278+
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
279+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
280+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
281+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
282+
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
283+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
284+
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
285+
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
286+
;
255287
bb:
256288
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
257289
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
291323
;
292324
; GFX8-LABEL: @uadd_sat_v4i16(
293325
; GFX8-NEXT: bb:
294-
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
295-
; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
296-
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
297-
; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
298-
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
299-
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
326+
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
327+
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
300328
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
301-
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
302-
; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
303-
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
304-
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
305-
; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
306-
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
329+
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
330+
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
331+
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
332+
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
333+
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
334+
;
335+
; GFX9-LABEL: @uadd_sat_v4i16(
336+
; GFX9-NEXT: bb:
337+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
338+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
339+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
340+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
341+
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
342+
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
343+
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
344+
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
307345
;
308346
bb:
309347
%arg0.0 = extractelement <4 x i16> %arg0, i64 0

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
33
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
77
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
2121
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
2222
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
2323
;
24+
; GFX9-LABEL: @uadd_sat_v2i16(
25+
; GFX9-NEXT: bb:
26+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
27+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
28+
;
2429
bb:
2530
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
2631
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
5156
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
5257
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
5358
;
59+
; GFX9-LABEL: @usub_sat_v2i16(
60+
; GFX9-NEXT: bb:
61+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
62+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
63+
;
5464
bb:
5565
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
5666
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
8191
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
8292
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
8393
;
94+
; GFX9-LABEL: @sadd_sat_v2i16(
95+
; GFX9-NEXT: bb:
96+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
97+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
98+
;
8499
bb:
85100
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
86101
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
111126
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
112127
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
113128
;
129+
; GFX9-LABEL: @ssub_sat_v2i16(
130+
; GFX9-NEXT: bb:
131+
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
132+
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
133+
;
114134
bb:
115135
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
116136
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
252272
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
253273
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
254274
;
275+
; GFX9-LABEL: @uadd_sat_v3i16(
276+
; GFX9-NEXT: bb:
277+
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
278+
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
279+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
280+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
281+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
282+
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
283+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
284+
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
285+
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
286+
;
255287
bb:
256288
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
257289
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
291323
;
292324
; GFX8-LABEL: @uadd_sat_v4i16(
293325
; GFX8-NEXT: bb:
294-
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
295-
; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
296-
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
297-
; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
298-
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
299-
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
326+
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
327+
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
300328
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
301-
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
302-
; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
303-
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
304-
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
305-
; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
306-
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
329+
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
330+
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
331+
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
332+
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
333+
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
334+
;
335+
; GFX9-LABEL: @uadd_sat_v4i16(
336+
; GFX9-NEXT: bb:
337+
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
338+
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
339+
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
340+
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
341+
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
342+
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
343+
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
344+
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
307345
;
308346
bb:
309347
%arg0.0 = extractelement <4 x i16> %arg0, i64 0

llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,10 @@
44
define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
55
; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
66
; CHECK-NEXT: bb:
7-
; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
8-
; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
9-
; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
10-
; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
11-
; CHECK-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
12-
; CHECK-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
13-
; CHECK-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
14-
; CHECK-NEXT: [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
15-
; CHECK-NEXT: ret <2 x i16> [[INS_2]]
7+
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> [[ARG0:%.*]], <9 x i16> poison, <2 x i32> <i32 poison, i32 8>
8+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
9+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
10+
; CHECK-NEXT: ret <2 x i16> [[TMP2]]
1611
;
1712
bb:
1813
%arg0.1 = extractelement <9 x i16> undef, i64 7

0 commit comments

Comments
 (0)