Skip to content

Commit d84e5fd

Browse files
committed
[AMDGPU][GlobalISel] Fix v2s16 right shifts
When widening, each half of the v2s16 operands needs to be sign-extended for G_ASHR or zero-extended for G_LSHR.

Differential Revision: https://reviews.llvm.org/D96048
1 parent b3bb5c3 commit d84e5fd

File tree

5 files changed

+81
-42
lines changed

5 files changed

+81
-42
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1564,9 +1564,11 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
15641564
// Return a suitable opcode for extending the operands of Opc when widening.
15651565
static unsigned getExtendOp(unsigned Opc) {
15661566
switch (Opc) {
1567+
case TargetOpcode::G_ASHR:
15671568
case TargetOpcode::G_SMIN:
15681569
case TargetOpcode::G_SMAX:
15691570
return TargetOpcode::G_SEXT;
1571+
case TargetOpcode::G_LSHR:
15701572
case TargetOpcode::G_UMIN:
15711573
case TargetOpcode::G_UMAX:
15721574
return TargetOpcode::G_ZEXT;

llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll

Lines changed: 40 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -560,11 +560,13 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
560560
;
561561
; GFX9-LABEL: s_ashr_v2i16:
562562
; GFX9: ; %bb.0:
563-
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
564-
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
563+
; GFX9-NEXT: s_sext_i32_i16 s2, s0
564+
; GFX9-NEXT: s_sext_i32_i16 s3, s1
565+
; GFX9-NEXT: s_ashr_i32 s0, s0, 16
566+
; GFX9-NEXT: s_ashr_i32 s1, s1, 16
567+
; GFX9-NEXT: s_ashr_i32 s2, s2, s3
565568
; GFX9-NEXT: s_ashr_i32 s0, s0, s1
566-
; GFX9-NEXT: s_ashr_i32 s1, s2, s3
567-
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
569+
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
568570
; GFX9-NEXT: ; return to shader part epilog
569571
%result = ashr <2 x i16> %value, %amount
570572
%cast = bitcast <2 x i16> %result to i32
@@ -754,16 +756,20 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
754756
;
755757
; GFX9-LABEL: s_ashr_v4i16:
756758
; GFX9: ; %bb.0:
757-
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
758-
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
759+
; GFX9-NEXT: s_sext_i32_i16 s4, s0
760+
; GFX9-NEXT: s_sext_i32_i16 s5, s2
761+
; GFX9-NEXT: s_ashr_i32 s0, s0, 16
762+
; GFX9-NEXT: s_ashr_i32 s2, s2, 16
759763
; GFX9-NEXT: s_ashr_i32 s0, s0, s2
760-
; GFX9-NEXT: s_ashr_i32 s2, s4, s5
761-
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
762-
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
763-
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
764-
; GFX9-NEXT: s_ashr_i32 s1, s1, s3
764+
; GFX9-NEXT: s_ashr_i32 s4, s4, s5
765+
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0
766+
; GFX9-NEXT: s_sext_i32_i16 s2, s1
767+
; GFX9-NEXT: s_sext_i32_i16 s4, s3
768+
; GFX9-NEXT: s_ashr_i32 s1, s1, 16
769+
; GFX9-NEXT: s_ashr_i32 s3, s3, 16
765770
; GFX9-NEXT: s_ashr_i32 s2, s2, s4
766-
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
771+
; GFX9-NEXT: s_ashr_i32 s1, s1, s3
772+
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
767773
; GFX9-NEXT: ; return to shader part epilog
768774
%result = ashr <4 x i16> %value, %amount
769775
%cast = bitcast <4 x i16> %result to <2 x i32>
@@ -968,26 +974,34 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
968974
;
969975
; GFX9-LABEL: s_ashr_v8i16:
970976
; GFX9: ; %bb.0:
971-
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
972-
; GFX9-NEXT: s_lshr_b32 s9, s4, 16
977+
; GFX9-NEXT: s_sext_i32_i16 s8, s0
978+
; GFX9-NEXT: s_sext_i32_i16 s9, s4
979+
; GFX9-NEXT: s_ashr_i32 s0, s0, 16
980+
; GFX9-NEXT: s_ashr_i32 s4, s4, 16
973981
; GFX9-NEXT: s_ashr_i32 s0, s0, s4
974-
; GFX9-NEXT: s_ashr_i32 s4, s8, s9
975-
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
976-
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
977-
; GFX9-NEXT: s_lshr_b32 s8, s5, 16
982+
; GFX9-NEXT: s_ashr_i32 s8, s8, s9
983+
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s0
984+
; GFX9-NEXT: s_sext_i32_i16 s4, s1
985+
; GFX9-NEXT: s_sext_i32_i16 s8, s5
986+
; GFX9-NEXT: s_ashr_i32 s1, s1, 16
987+
; GFX9-NEXT: s_ashr_i32 s5, s5, 16
978988
; GFX9-NEXT: s_ashr_i32 s1, s1, s5
979989
; GFX9-NEXT: s_ashr_i32 s4, s4, s8
980-
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
981-
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
982-
; GFX9-NEXT: s_lshr_b32 s5, s6, 16
990+
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1
991+
; GFX9-NEXT: s_sext_i32_i16 s4, s2
992+
; GFX9-NEXT: s_sext_i32_i16 s5, s6
993+
; GFX9-NEXT: s_ashr_i32 s2, s2, 16
994+
; GFX9-NEXT: s_ashr_i32 s6, s6, 16
983995
; GFX9-NEXT: s_ashr_i32 s4, s4, s5
984996
; GFX9-NEXT: s_ashr_i32 s2, s2, s6
985-
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
986-
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
987-
; GFX9-NEXT: s_lshr_b32 s5, s7, 16
988-
; GFX9-NEXT: s_ashr_i32 s3, s3, s7
997+
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2
998+
; GFX9-NEXT: s_sext_i32_i16 s4, s3
999+
; GFX9-NEXT: s_sext_i32_i16 s5, s7
1000+
; GFX9-NEXT: s_ashr_i32 s3, s3, 16
1001+
; GFX9-NEXT: s_ashr_i32 s6, s7, 16
9891002
; GFX9-NEXT: s_ashr_i32 s4, s4, s5
990-
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
1003+
; GFX9-NEXT: s_ashr_i32 s3, s3, s6
1004+
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
9911005
; GFX9-NEXT: ; return to shader part epilog
9921006
%result = ashr <8 x i16> %value, %amount
9931007
%cast = bitcast <8 x i16> %result to <4 x i32>

llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -569,10 +569,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
569569
;
570570
; GFX9-LABEL: s_lshr_v2i16:
571571
; GFX9: ; %bb.0:
572+
; GFX9-NEXT: s_mov_b32 s3, 0xffff
572573
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
573-
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
574+
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
575+
; GFX9-NEXT: s_and_b32 s0, s0, s3
576+
; GFX9-NEXT: s_and_b32 s1, s1, s3
574577
; GFX9-NEXT: s_lshr_b32 s0, s0, s1
575-
; GFX9-NEXT: s_lshr_b32 s1, s2, s3
578+
; GFX9-NEXT: s_lshr_b32 s1, s2, s4
576579
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
577580
; GFX9-NEXT: ; return to shader part epilog
578581
%result = lshr <2 x i16> %value, %amount
@@ -747,13 +750,18 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
747750
;
748751
; GFX9-LABEL: s_lshr_v4i16:
749752
; GFX9: ; %bb.0:
753+
; GFX9-NEXT: s_mov_b32 s5, 0xffff
750754
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
751-
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
755+
; GFX9-NEXT: s_lshr_b32 s6, s2, 16
756+
; GFX9-NEXT: s_and_b32 s0, s0, s5
757+
; GFX9-NEXT: s_and_b32 s2, s2, s5
752758
; GFX9-NEXT: s_lshr_b32 s0, s0, s2
753-
; GFX9-NEXT: s_lshr_b32 s2, s4, s5
759+
; GFX9-NEXT: s_lshr_b32 s2, s4, s6
754760
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
755761
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
756762
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
763+
; GFX9-NEXT: s_and_b32 s1, s1, s5
764+
; GFX9-NEXT: s_and_b32 s3, s3, s5
757765
; GFX9-NEXT: s_lshr_b32 s1, s1, s3
758766
; GFX9-NEXT: s_lshr_b32 s2, s2, s4
759767
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
@@ -937,24 +945,33 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
937945
;
938946
; GFX9-LABEL: s_lshr_v8i16:
939947
; GFX9: ; %bb.0:
948+
; GFX9-NEXT: s_mov_b32 s9, 0xffff
940949
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
941-
; GFX9-NEXT: s_lshr_b32 s9, s4, 16
950+
; GFX9-NEXT: s_lshr_b32 s10, s4, 16
951+
; GFX9-NEXT: s_and_b32 s0, s0, s9
952+
; GFX9-NEXT: s_and_b32 s4, s4, s9
942953
; GFX9-NEXT: s_lshr_b32 s0, s0, s4
943-
; GFX9-NEXT: s_lshr_b32 s4, s8, s9
954+
; GFX9-NEXT: s_lshr_b32 s4, s8, s10
944955
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
945956
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
946957
; GFX9-NEXT: s_lshr_b32 s8, s5, 16
958+
; GFX9-NEXT: s_and_b32 s1, s1, s9
959+
; GFX9-NEXT: s_and_b32 s5, s5, s9
947960
; GFX9-NEXT: s_lshr_b32 s1, s1, s5
948961
; GFX9-NEXT: s_lshr_b32 s4, s4, s8
949962
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
950963
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
951964
; GFX9-NEXT: s_lshr_b32 s5, s6, 16
952-
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
965+
; GFX9-NEXT: s_and_b32 s2, s2, s9
966+
; GFX9-NEXT: s_and_b32 s6, s6, s9
953967
; GFX9-NEXT: s_lshr_b32 s2, s2, s6
968+
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
954969
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
955970
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
956971
; GFX9-NEXT: s_lshr_b32 s5, s7, 16
957-
; GFX9-NEXT: s_lshr_b32 s3, s3, s7
972+
; GFX9-NEXT: s_and_b32 s3, s3, s9
973+
; GFX9-NEXT: s_and_b32 s6, s7, s9
974+
; GFX9-NEXT: s_lshr_b32 s3, s3, s6
958975
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
959976
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
960977
; GFX9-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -183,14 +183,16 @@ body: |
183183
; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
184184
; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
185185
; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
186+
; CHECK: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST]], 16
186187
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
187-
; CHECK: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
188+
; CHECK: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
188189
; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
190+
; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
189191
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
190-
; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
191-
; CHECK: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[BITCAST1]](s32)
192-
; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[LSHR]], [[LSHR1]](s32)
193-
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR]](s32), [[ASHR1]](s32)
192+
; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
193+
; CHECK: [[ASHR2:%[0-9]+]]:sgpr(s32) = G_ASHR [[SEXT_INREG]], [[SEXT_INREG1]](s32)
194+
; CHECK: [[ASHR3:%[0-9]+]]:sgpr(s32) = G_ASHR [[ASHR]], [[ASHR1]](s32)
195+
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR2]](s32), [[ASHR3]](s32)
194196
; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
195197
%0:_(<2 x s16>) = COPY $sgpr0
196198
%1:_(<2 x s16>) = COPY $sgpr1

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -185,10 +185,14 @@ body: |
185185
; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
186186
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
187187
; CHECK: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
188+
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
189+
; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
188190
; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
189-
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
190-
; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
191-
; CHECK: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[BITCAST1]](s32)
191+
; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
192+
; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
193+
; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
194+
; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
195+
; CHECK: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[AND]], [[AND1]](s32)
192196
; CHECK: [[LSHR3:%[0-9]+]]:sgpr(s32) = G_LSHR [[LSHR]], [[LSHR1]](s32)
193197
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
194198
; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)

0 commit comments

Comments
 (0)