Skip to content

[AMDGPU] Vectorize more 16 bit shuffles #90648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 45 additions & 20 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,31 +1129,56 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
if (!isa<FixedVectorType>(VT))
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);

Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
// Treat extractsubvector as single op permutation.
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
if (IsExtractSubvector)
Kind = TTI::SK_PermuteSingleSrc;

if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle is free.

switch (Kind) {
case TTI::SK_Broadcast:
case TTI::SK_Reverse:
case TTI::SK_PermuteSingleSrc:
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
if (RequestedElts == 0)
return 0;
switch (Kind) {
case TTI::SK_Broadcast:
case TTI::SK_Reverse:
case TTI::SK_PermuteSingleSrc: {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (HasVOP3P && NumVectorElts == 2)
return 0;
default:
break;
}
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
return NumPerms + NumPermMasks;
}
case TTI::SK_ExtractSubvector:
case TTI::SK_InsertSubvector: {
// Even aligned accesses are free
if (!(Index % 2))
return 0;
// Insert/extract subvectors only require shifts / extract code to get the
// relevant bits
return alignTo(RequestedElts, 2) / 2;
}
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
return NumPerms + NumPermMasks;
}

default:
break;
}
}
// Restore optimal kind.
if (IsExtractSubvector)
Kind = TTI::SK_ExtractSubvector;

return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}
Expand Down
1,621 changes: 1,079 additions & 542 deletions llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s

define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: @uadd_sat_v2i16(
Expand All @@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @uadd_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @usub_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @sadd_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @ssub_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
;
bb:
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
Expand Down Expand Up @@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0
Expand Down
64 changes: 51 additions & 13 deletions llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s

define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: @uadd_sat_v2i16(
Expand All @@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @uadd_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @usub_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @sadd_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
; GFX9-LABEL: @ssub_sat_v2i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX9-NEXT: ret <2 x i16> [[TMP0]]
;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
Expand Down Expand Up @@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
;
bb:
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
Expand Down Expand Up @@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,10 @@
define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
; CHECK-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; CHECK-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; CHECK-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
; CHECK-NEXT: [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
; CHECK-NEXT: ret <2 x i16> [[INS_2]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> [[ARG0:%.*]], <9 x i16> poison, <2 x i32> <i32 poison, i32 8>
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; CHECK-NEXT: ret <2 x i16> [[TMP2]]
;
bb:
%arg0.1 = extractelement <9 x i16> undef, i64 7
Expand Down
Loading
Loading