Skip to content

Commit 43924cb

Browse files
committed
[AMDGPU][GlobalISel] Fix selection of image sample g16 instructions
Before GFX10, the A16 modifier implied G16. From GFX10 onwards, there are separate instructions for 16-bit gradients. This fixes the condition for selecting G16 opcodes. Also, stop adding the G16 flag to instructions that do not use gradients on GFX10 and later.
1 parent dcee187 commit 43924cb

7 files changed

+456
-104
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1824,7 +1824,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
18241824
}
18251825

18261826
// Set G16 opcode
1827-
if (IsG16 && !IsA16) {
1827+
if (Subtarget->hasG16() && IsG16) {
18281828
const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
18291829
AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
18301830
assert(G16MappingInfo);

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4901,7 +4901,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
49014901
MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
49024902
LLT AddrTy =
49034903
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4904-
const bool IsG16 = GradTy == S16;
4904+
const bool IsG16 =
4905+
ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
49054906
const bool IsA16 = AddrTy == S16;
49064907
const bool IsD16 = Ty.getScalarType() == S16;
49074908

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll

Lines changed: 24 additions & 24 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll

Lines changed: 50 additions & 50 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll

Lines changed: 28 additions & 28 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll

Lines changed: 277 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,80 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, floa
857857
declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
858858
declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
859859

860+
define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
861+
; GFX10-LABEL: sample_d_1d_g16_a16:
862+
; GFX10: ; %bb.0: ; %main_body
863+
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
864+
; GFX10-NEXT: s_waitcnt vmcnt(0)
865+
; GFX10-NEXT: ; return to shader part epilog
866+
;
867+
; GFX10GISEL-LABEL: sample_d_1d_g16_a16:
868+
; GFX10GISEL: ; %bb.0: ; %main_body
869+
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
870+
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
871+
; GFX10GISEL-NEXT: ; return to shader part epilog
872+
main_body:
873+
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
874+
ret <4 x float> %v
875+
}
876+
877+
define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
878+
; GFX10-LABEL: sample_d_2d_g16_a16:
879+
; GFX10: ; %bb.0: ; %main_body
880+
; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
881+
; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100
882+
; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100
883+
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
884+
; GFX10-NEXT: s_waitcnt vmcnt(0)
885+
; GFX10-NEXT: ; return to shader part epilog
886+
;
887+
; GFX10GISEL-LABEL: sample_d_2d_g16_a16:
888+
; GFX10GISEL: ; %bb.0: ; %main_body
889+
; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
890+
; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
891+
; GFX10GISEL-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
892+
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
893+
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
894+
; GFX10GISEL-NEXT: ; return to shader part epilog
895+
main_body:
896+
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
897+
ret <4 x float> %v
898+
}
899+
900+
define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) {
901+
; GFX10-LABEL: sample_d_3d_g16_a16:
902+
; GFX10: ; %bb.0: ; %main_body
903+
; GFX10-NEXT: v_mov_b32_e32 v12, v8
904+
; GFX10-NEXT: v_mov_b32_e32 v10, v5
905+
; GFX10-NEXT: v_mov_b32_e32 v8, v2
906+
; GFX10-NEXT: v_perm_b32 v11, v7, v6, 0x5040100
907+
; GFX10-NEXT: v_perm_b32 v9, v4, v3, 0x5040100
908+
; GFX10-NEXT: v_perm_b32 v7, v1, v0, 0x5040100
909+
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
910+
; GFX10-NEXT: s_waitcnt vmcnt(0)
911+
; GFX10-NEXT: ; return to shader part epilog
912+
;
913+
; GFX10GISEL-LABEL: sample_d_3d_g16_a16:
914+
; GFX10GISEL: ; %bb.0: ; %main_body
915+
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
916+
; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v7
917+
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2
918+
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
919+
; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100
920+
; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
921+
; GFX10GISEL-NEXT: v_perm_b32 v6, v10, v6, 0x5040100
922+
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
923+
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
924+
; GFX10GISEL-NEXT: ; return to shader part epilog
925+
main_body:
926+
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
927+
ret <4 x float> %v
928+
}
929+
930+
declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32)
931+
declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32)
932+
declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32)
933+
860934
attributes #0 = { nounwind }
861935
attributes #1 = { nounwind readonly }
862936
attributes #2 = { nounwind readnone }

0 commit comments

Comments
 (0)