-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AMDGPU][GFX12] Default component broadcast store #76212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][GFX12] Default component broadcast store #76212
Conversation
For image and buffer stores the default behaviour on GFX12 is to set all unset components to the value of the first component. So if we pass only X component, it will be the same as XXXX, or XY same as XYXX. This patch simplifies the passed vector of components in InstCombine by removing components from the end that are equal to the first component. For image stores it also trims DMask if necessary.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Mariusz Sikora (mariusz-sikora-at-amd) ChangesFor image and buffer stores the default behaviour on GFX12 is to set all unset components to the value of the first component. So if we pass only X component, it will be the same as XXXX, or XY same as XYXX. This patch simplifies the passed vector of components in InstCombine by removing components from the end that are equal to the first component. For image stores it also trims DMask if necessary. Full diff: https://github.com/llvm/llvm-project/pull/76212.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2bb7b6bd0674a2..da2f862308558b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
return DemandedElts;
}
+// Trim elements of the end of the vector \p V, if they are
+// equal to the first element of the vector.
+static APInt defaultComponentBroadcast(Value *V) {
+ auto *VTy = cast<FixedVectorType>(V->getType());
+ unsigned VWidth = VTy->getNumElements();
+ APInt DemandedElts = APInt::getAllOnes(VWidth);
+ Value *FirstComponent = findScalarElement(V, 0);
+
+ SmallVector<int> ShuffleMask;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
+ SVI->getShuffleMask(ShuffleMask);
+
+ for (int I = VWidth - 1; I > 0; --I) {
+ if (ShuffleMask.empty()) {
+ auto *Elt = findScalarElement(V, I);
+ if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
+ break;
+ } else {
+ // Detect identical elements in the shufflevector result, even though
+ // findScalarElement cannot tell us what that element is.
+ if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
+ break;
+ }
+ DemandedElts.clearBit(I);
+ }
+
+ return DemandedElts;
+}
+
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
IntrinsicInst &II,
APInt DemandedElts,
@@ -1140,8 +1169,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
break;
- APInt DemandedElts =
- trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+ APInt DemandedElts;
+ if (AMDGPU::isGFX12Plus(*ST))
+ DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
+ else
+ DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index f2d904cce7f00d..95b1d09bbd6036 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -23,7 +23,8 @@ define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg
; GCN-NEXT: ret void
;
; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end(
-; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
; GFX12-NEXT: ret void
;
%newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -63,9 +64,9 @@ define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %
; GCN-NEXT: ret void
;
; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFX12-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
+; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
; GFX12-NEXT: ret void
;
%newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -84,9 +85,9 @@ define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %
; GCN-NEXT: ret void
;
; GFX12-LABEL: @struct_buffer_store_format_insert_zeros(
-; GFX12-NEXT: [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
-; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
; GFX12-NEXT: ret void
;
%newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -140,8 +141,8 @@ define amdgpu_ps void @image_store_1d_store_shufflevector_same(<8 x i32> inreg %
; GCN-NEXT: ret void
;
; GFX12-LABEL: @image_store_1d_store_shufflevector_same(
-; GFX12-NEXT: [[DATA:%.*]] = shufflevector <4 x float> [[VDATA1:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
-; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[DATA]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VDATA1:%.*]], i64 0
+; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.f32.i32(float [[TMP1]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
; GFX12-NEXT: ret void
;
%data = shufflevector <4 x float> %vdata1, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -155,7 +156,7 @@ define amdgpu_ps void @image_store_1d_store_shufflevector(<8 x i32> inreg %rsrc,
; GCN-NEXT: ret void
;
; GFX12-LABEL: @image_store_1d_store_shufflevector(
-; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> <float 2.000000e+00, float 2.000000e+00, float 5.000000e+00, float 2.000000e+00>, i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> <float 2.000000e+00, float 2.000000e+00, float 5.000000e+00>, i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
; GFX12-NEXT: ret void
;
%data = shufflevector <4 x float> <float 2.0, float 1.0, float 2.0, float 5.0>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
@@ -172,10 +173,8 @@ define amdgpu_ps void @struct_buffer_store_format_insert_first_at_end(<4 x i32>
; GCN-NEXT: ret void
;
; GFX12-LABEL: @struct_buffer_store_format_insert_first_at_end(
-; GFX12-NEXT: [[NEWVDATA2:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT: [[NEWVDATA3:%.*]] = insertelement <4 x float> [[NEWVDATA2]], float [[VDATA1]], i64 2
-; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3
-; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFX12-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
; GFX12-NEXT: ret void
;
%newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -194,9 +193,8 @@ define amdgpu_ps void @struct_tbuffer_store_insert(<4 x i32> inreg %a, float %vd
; GCN-NEXT: ret void
;
; GFX12-LABEL: @struct_tbuffer_store_insert(
-; GFX12-NEXT: [[NEWVDATA3:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 2.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3
-; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFX12-NEXT: [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 1.000000e+00, float 2.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v3f32(<3 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
; GFX12-NEXT: ret void
;
%newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
|
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. @arsenm does this address your concerns?
ping @arsenm |
…nt-broadcast-store
Merge with upstream to run tests. I will merge this changes if CI will pass. |
def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast", | ||
"HasDefaultComponentBroadcast", | ||
"true", | ||
"BUFFER/IMAGE store instructions set unspecified components to x component" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably should add gfx numbers to the description
For image and buffer stores the default behaviour on GFX12 is to set all unset components to the value of the first component. So if we pass only X component, it will be the same as XXXX, or XY same as XYXX. This patch simplifies the passed vector of components in InstCombine by removing components from the end that are equal to the first component. For image stores it also trims DMask if necessary. --------- Co-authored-by: Mateja Marjanovic <[email protected]>
For image and buffer stores the default behaviour on GFX12 is to set all unset components to the value of the first component. So if we pass only X component, it will be the same as XXXX, or XY same as XYXX.
This patch simplifies the passed vector of components in InstCombine by removing components from the end that are equal to the first component.
For image stores it also trims DMask if necessary.