Skip to content

Commit 8a52bd8

Browse files
committed
[AMDGPU] Only select VOP3 forms of VOP2 instructions
Change VOP_PAT_GEN to default to not generating an instruction selection pattern for the VOP2 (e32) form of an instruction, only for the VOP3 (e64) form. This allows SIFoldOperands maximum freedom to fold copies into the operands of an instruction, before SIShrinkInstructions tries to shrink it back to the smaller encoding.

This affects the following VOP2 instructions:
  v_min_i32
  v_max_i32
  v_min_u32
  v_max_u32
  v_and_b32
  v_or_b32
  v_xor_b32
  v_lshr_b32
  v_ashr_i32
  v_lshl_b32

A further cleanup could simplify or remove VOP_PAT_GEN, since its optional second argument is never used.

Differential Revision: https://reviews.llvm.org/D114252
1 parent c52ff0c commit 8a52bd8

21 files changed

+384
-409
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2246,7 +2246,7 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
22462246
let HasExtSDWA9 = 0;
22472247
}
22482248

2249-
class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> {
2249+
class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.ArgVT> {
22502250
let NeedPatGen = mode;
22512251
}
22522252

llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
4040
; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
4141
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
4242
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
43-
; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
44-
; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
43+
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
44+
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4545
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
4646
; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
4747
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2-
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
1+
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
2+
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
33

44
; GCN-LABEL: {{^}}v_ubfe_sub_i32:
55
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
@@ -24,11 +24,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
2424
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
2525
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
2626

27-
; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
28-
; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
29-
30-
; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
31-
; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
27+
; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
28+
; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
3229

3330
; GCN: [[BFE]]
3431
; GCN: [[SHL]]
@@ -101,11 +98,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
10198
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
10299
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
103100

104-
; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
105-
; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
106-
107-
; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
108-
; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
101+
; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
102+
; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
109103

110104
; GCN: [[BFE]]
111105
; GCN: [[SHL]]

llvm/test/CodeGen/AMDGPU/commute-shifts.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
1717
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
1818
; SI-NEXT: v_and_b32_e32 v0, 7, v0
1919
; SI-NEXT: s_waitcnt vmcnt(0)
20-
; SI-NEXT: v_lshr_b32_e32 v0, v2, v0
20+
; SI-NEXT: v_lshrrev_b32_e32 v0, v0, v2
2121
; SI-NEXT: v_and_b32_e32 v0, 1, v0
2222
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
2323
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc

llvm/test/CodeGen/AMDGPU/ctpop16.ll

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
730730
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
731731
; SI-NEXT: v_and_b32_e32 v14, s0, v2
732732
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
733-
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v3
733+
; SI-NEXT: v_and_b32_e32 v15, s0, v3
734734
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
735735
; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0
736736
; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0
@@ -773,7 +773,6 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
773773
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
774774
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
775775
; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
776-
; VI-NEXT: v_mov_b32_e32 v8, 0xffff
777776
; VI-NEXT: s_mov_b32 s7, 0xf000
778777
; VI-NEXT: s_mov_b32 s6, -1
779778
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -786,23 +785,23 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
786785
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
787786
; VI-NEXT: s_mov_b32 s0, 0xffff
788787
; VI-NEXT: s_waitcnt vmcnt(1)
789-
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
790-
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
791-
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
792-
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
793-
; VI-NEXT: v_and_b32_e32 v3, v8, v3
794-
; VI-NEXT: v_and_b32_e32 v2, v8, v2
795-
; VI-NEXT: v_and_b32_e32 v1, v8, v1
796-
; VI-NEXT: v_and_b32_e32 v0, v8, v0
788+
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
789+
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
790+
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
791+
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
792+
; VI-NEXT: v_and_b32_e32 v3, s0, v3
793+
; VI-NEXT: v_and_b32_e32 v2, s0, v2
794+
; VI-NEXT: v_and_b32_e32 v1, s0, v1
795+
; VI-NEXT: v_and_b32_e32 v0, s0, v0
797796
; VI-NEXT: s_waitcnt vmcnt(0)
798-
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v7
797+
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
799798
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6
800799
; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5
801800
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4
801+
; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0
802802
; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0
803803
; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0
804804
; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0
805-
; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0
806805
; VI-NEXT: v_and_b32_e32 v7, s0, v7
807806
; VI-NEXT: v_and_b32_e32 v6, s0, v6
808807
; VI-NEXT: v_and_b32_e32 v5, s0, v5
@@ -811,27 +810,27 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
811810
; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
812811
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
813812
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
814-
; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0
813+
; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0
815814
; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0
816815
; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0
817816
; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0
817+
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
818818
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
819819
; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
820820
; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
821-
; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
822821
; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0
823822
; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0
824823
; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0
825824
; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0
826-
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
825+
; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
827826
; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
828827
; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
829828
; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
830-
; VI-NEXT: v_or_b32_e32 v3, v3, v9
831-
; VI-NEXT: v_or_b32_e32 v2, v2, v10
832-
; VI-NEXT: v_or_b32_e32 v1, v1, v11
833-
; VI-NEXT: v_or_b32_e32 v0, v0, v12
834-
; VI-NEXT: v_or_b32_e32 v7, v7, v8
829+
; VI-NEXT: v_or_b32_e32 v3, v3, v8
830+
; VI-NEXT: v_or_b32_e32 v2, v2, v9
831+
; VI-NEXT: v_or_b32_e32 v1, v1, v10
832+
; VI-NEXT: v_or_b32_e32 v0, v0, v11
833+
; VI-NEXT: v_or_b32_e32 v7, v7, v12
835834
; VI-NEXT: v_or_b32_e32 v6, v6, v13
836835
; VI-NEXT: v_or_b32_e32 v5, v5, v14
837836
; VI-NEXT: v_or_b32_e32 v4, v4, v15

llvm/test/CodeGen/AMDGPU/extract-lowbits.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
166166
; SI: ; %bb.0:
167167
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168168
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
169-
; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
170-
; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
169+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
170+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
171171
; SI-NEXT: s_setpc_b64 s[30:31]
172172
;
173173
; VI-LABEL: bzhi32_d1_indexzext:

llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Lines changed: 58 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -500,9 +500,9 @@ define void @store_load_vindex_foo(i32 %idx) {
500500
; GFX9: ; %bb.0: ; %bb
501501
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502502
; GFX9-NEXT: v_mov_b32_e32 v1, s32
503-
; GFX9-NEXT: v_mov_b32_e32 v3, 15
504503
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
505-
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
504+
; GFX9-NEXT: v_mov_b32_e32 v3, 15
505+
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
506506
; GFX9-NEXT: scratch_store_dword v2, v3, off
507507
; GFX9-NEXT: s_waitcnt vmcnt(0)
508508
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -514,24 +514,24 @@ define void @store_load_vindex_foo(i32 %idx) {
514514
; GFX10: ; %bb.0: ; %bb
515515
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
516516
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
517-
; GFX10-NEXT: v_mov_b32_e32 v1, 15
518-
; GFX10-NEXT: v_mov_b32_e32 v2, s32
519-
; GFX10-NEXT: v_and_b32_e32 v3, v0, v1
520-
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2
521-
; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2
522-
; GFX10-NEXT: scratch_store_dword v0, v1, off
517+
; GFX10-NEXT: v_mov_b32_e32 v1, s32
518+
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
519+
; GFX10-NEXT: v_mov_b32_e32 v3, 15
520+
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
521+
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
522+
; GFX10-NEXT: scratch_store_dword v0, v3, off
523523
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
524-
; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc
524+
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
525525
; GFX10-NEXT: s_waitcnt vmcnt(0)
526526
; GFX10-NEXT: s_setpc_b64 s[30:31]
527527
;
528528
; GFX9-PAL-LABEL: store_load_vindex_foo:
529529
; GFX9-PAL: ; %bb.0: ; %bb
530530
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531531
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32
532-
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
533532
; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
534-
; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
533+
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
534+
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
535535
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
536536
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
537537
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -543,14 +543,14 @@ define void @store_load_vindex_foo(i32 %idx) {
543543
; GFX10-PAL: ; %bb.0: ; %bb
544544
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545545
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
546-
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
547-
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32
548-
; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
549-
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
550-
; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
551-
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
546+
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s32
547+
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
548+
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
549+
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
550+
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
551+
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
552552
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
553-
; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc
553+
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
554554
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
555555
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
556556
bb:
@@ -1247,9 +1247,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
12471247
; GFX9-NEXT: s_waitcnt vmcnt(0)
12481248
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100
12491249
; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi
1250-
; GFX9-NEXT: v_mov_b32_e32 v3, 15
12511250
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
1252-
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
1251+
; GFX9-NEXT: v_mov_b32_e32 v3, 15
1252+
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
12531253
; GFX9-NEXT: scratch_store_dword v2, v3, off
12541254
; GFX9-NEXT: s_waitcnt vmcnt(0)
12551255
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -1261,17 +1261,17 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
12611261
; GFX10: ; %bb.0: ; %bb
12621262
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12631263
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1264-
; GFX10-NEXT: v_mov_b32_e32 v1, 15
12651264
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
1266-
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
1267-
; GFX10-NEXT: v_and_b32_e32 v3, v0, v1
1268-
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2
1269-
; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2
1270-
; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
1265+
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
1266+
; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
1267+
; GFX10-NEXT: v_mov_b32_e32 v3, 15
1268+
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
1269+
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
1270+
; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc
12711271
; GFX10-NEXT: s_waitcnt vmcnt(0)
1272-
; GFX10-NEXT: scratch_store_dword v0, v1, off
1272+
; GFX10-NEXT: scratch_store_dword v0, v3, off
12731273
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1274-
; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc
1274+
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
12751275
; GFX10-NEXT: s_waitcnt vmcnt(0)
12761276
; GFX10-NEXT: s_setpc_b64 s[30:31]
12771277
;
@@ -1282,9 +1282,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
12821282
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
12831283
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100
12841284
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
1285-
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
12861285
; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
1287-
; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
1286+
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
1287+
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
12881288
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
12891289
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
12901290
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -1296,17 +1296,17 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
12961296
; GFX10-PAL: ; %bb.0: ; %bb
12971297
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12981298
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1299-
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
13001299
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
1301-
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo
1302-
; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
1303-
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
1304-
; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
1305-
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc
1300+
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
1301+
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
1302+
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
1303+
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
1304+
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
1305+
; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc
13061306
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
1307-
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
1307+
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
13081308
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1309-
; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc
1309+
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
13101310
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
13111311
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
13121312
bb:
@@ -2019,9 +2019,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
20192019
; GFX9-NEXT: s_waitcnt vmcnt(0)
20202020
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004
20212021
; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi
2022-
; GFX9-NEXT: v_mov_b32_e32 v3, 15
20232022
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
2024-
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
2023+
; GFX9-NEXT: v_mov_b32_e32 v3, 15
2024+
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
20252025
; GFX9-NEXT: scratch_store_dword v2, v3, off
20262026
; GFX9-NEXT: s_waitcnt vmcnt(0)
20272027
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -2033,17 +2033,17 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
20332033
; GFX10: ; %bb.0: ; %bb
20342034
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20352035
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2036-
; GFX10-NEXT: v_mov_b32_e32 v1, 15
20372036
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
2038-
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
2039-
; GFX10-NEXT: v_and_b32_e32 v3, v0, v1
2040-
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2
2041-
; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2
2042-
; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
2037+
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
2038+
; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
2039+
; GFX10-NEXT: v_mov_b32_e32 v3, 15
2040+
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
2041+
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
2042+
; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc
20432043
; GFX10-NEXT: s_waitcnt vmcnt(0)
2044-
; GFX10-NEXT: scratch_store_dword v0, v1, off
2044+
; GFX10-NEXT: scratch_store_dword v0, v3, off
20452045
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2046-
; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc
2046+
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
20472047
; GFX10-NEXT: s_waitcnt vmcnt(0)
20482048
; GFX10-NEXT: s_setpc_b64 s[30:31]
20492049
;
@@ -2054,9 +2054,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
20542054
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
20552055
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004
20562056
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
2057-
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
20582057
; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
2059-
; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
2058+
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
2059+
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
20602060
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
20612061
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
20622062
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -2068,17 +2068,17 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
20682068
; GFX10-PAL: ; %bb.0: ; %bb
20692069
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20702070
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2071-
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
20722071
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
2073-
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo
2074-
; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
2075-
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
2076-
; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
2077-
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
2072+
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
2073+
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
2074+
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
2075+
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
2076+
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
2077+
; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc
20782078
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
2079-
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
2079+
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
20802080
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2081-
; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc
2081+
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
20822082
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
20832083
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
20842084
bb:

0 commit comments

Comments
 (0)