Skip to content

Commit dd32d26

Browse files
authored
[AMDGPU] Form V_MAD_U64_U32 from mul24 (#72393)
Fixes SWDEV-421067
1 parent 2ce9a79 commit dd32d26

File tree

3 files changed

+194
-57
lines changed

3 files changed

+194
-57
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -678,11 +678,22 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
678678
>;
679679
}
680680

681+
// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
682+
// We need to separate this because otherwise OtherPredicates would be overridden.
683+
class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
684+
(i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
685+
(inst $src0, $src1, $src2, 0 /* clamp */)
686+
>;
687+
681688
// exclude pre-GFX9 where it was slow
682-
let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in
689+
let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
683690
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
684-
let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
691+
def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
692+
}
693+
let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
685694
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
695+
def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
696+
}
686697

687698
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
688699
let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,

llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll

Lines changed: 163 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
99
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
1010

11-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
12-
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
11+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG,GFX900-SDAG,GFX900 %s
12+
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL,GFX900-GISEL,GFX900 %s
1313

1414
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A,GFX9-SDAG,GFX90A-SDAG %s
1515
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A,GFX9-GISEL,GFX90A-GISEL %s
@@ -5482,23 +5482,41 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
54825482
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v1
54835483
; GFX8-NEXT: s_setpc_b64 s[30:31]
54845484
;
5485-
; GFX900-LABEL: v_multi_use_mul_chain_add_other_use_all:
5486-
; GFX900: ; %bb.0: ; %bb
5487-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5488-
; GFX900-NEXT: v_add_u32_e32 v0, 1, v0
5489-
; GFX900-NEXT: v_mul_lo_u32 v2, v0, v1
5490-
; GFX900-NEXT: v_add_u32_e32 v0, v2, v0
5491-
; GFX900-NEXT: v_mul_lo_u32 v0, v0, v1
5492-
; GFX900-NEXT: v_add_u32_e32 v1, 1, v2
5493-
; GFX900-NEXT: v_mul_lo_u32 v5, v0, v1
5494-
; GFX900-NEXT: global_store_dword v[3:4], v2, off
5495-
; GFX900-NEXT: s_waitcnt vmcnt(0)
5496-
; GFX900-NEXT: global_store_dword v[3:4], v0, off
5497-
; GFX900-NEXT: s_waitcnt vmcnt(0)
5498-
; GFX900-NEXT: global_store_dword v[3:4], v5, off
5499-
; GFX900-NEXT: s_waitcnt vmcnt(0)
5500-
; GFX900-NEXT: v_add_u32_e32 v0, v5, v1
5501-
; GFX900-NEXT: s_setpc_b64 s[30:31]
5485+
; GFX900-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
5486+
; GFX900-SDAG: ; %bb.0: ; %bb
5487+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5488+
; GFX900-SDAG-NEXT: v_add_u32_e32 v0, 1, v0
5489+
; GFX900-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1
5490+
; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v2, v0
5491+
; GFX900-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1
5492+
; GFX900-SDAG-NEXT: v_add_u32_e32 v1, 1, v2
5493+
; GFX900-SDAG-NEXT: v_mul_lo_u32 v5, v0, v1
5494+
; GFX900-SDAG-NEXT: global_store_dword v[3:4], v2, off
5495+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
5496+
; GFX900-SDAG-NEXT: global_store_dword v[3:4], v0, off
5497+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
5498+
; GFX900-SDAG-NEXT: global_store_dword v[3:4], v5, off
5499+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
5500+
; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v5, v1
5501+
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
5502+
;
5503+
; GFX900-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_all:
5504+
; GFX900-GISEL: ; %bb.0: ; %bb
5505+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5506+
; GFX900-GISEL-NEXT: v_add_u32_e32 v0, 1, v0
5507+
; GFX900-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1
5508+
; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v2, v0
5509+
; GFX900-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1
5510+
; GFX900-GISEL-NEXT: v_add_u32_e32 v1, 1, v2
5511+
; GFX900-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
5512+
; GFX900-GISEL-NEXT: global_store_dword v[3:4], v2, off
5513+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
5514+
; GFX900-GISEL-NEXT: global_store_dword v[3:4], v0, off
5515+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
5516+
; GFX900-GISEL-NEXT: global_store_dword v[3:4], v5, off
5517+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
5518+
; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v5, v1
5519+
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
55025520
;
55035521
; GFX90A-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
55045522
; GFX90A-SDAG: ; %bb.0: ; %bb
@@ -5686,21 +5704,37 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
56865704
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
56875705
; GFX8-NEXT: s_setpc_b64 s[30:31]
56885706
;
5689-
; GFX900-LABEL: v_multi_use_mul_chain_add_other_use_some:
5690-
; GFX900: ; %bb.0: ; %bb
5691-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5692-
; GFX900-NEXT: v_add_u32_e32 v0, 1, v0
5693-
; GFX900-NEXT: v_mul_lo_u32 v2, v0, v1
5694-
; GFX900-NEXT: v_add_u32_e32 v0, v2, v0
5695-
; GFX900-NEXT: v_mul_lo_u32 v0, v0, v1
5696-
; GFX900-NEXT: v_add_u32_e32 v1, 1, v2
5697-
; GFX900-NEXT: v_mul_lo_u32 v0, v0, v1
5698-
; GFX900-NEXT: global_store_dword v[3:4], v2, off
5699-
; GFX900-NEXT: s_waitcnt vmcnt(0)
5700-
; GFX900-NEXT: global_store_dword v[3:4], v0, off
5701-
; GFX900-NEXT: s_waitcnt vmcnt(0)
5702-
; GFX900-NEXT: v_add_u32_e32 v0, v0, v1
5703-
; GFX900-NEXT: s_setpc_b64 s[30:31]
5707+
; GFX900-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
5708+
; GFX900-SDAG: ; %bb.0: ; %bb
5709+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5710+
; GFX900-SDAG-NEXT: v_add_u32_e32 v0, 1, v0
5711+
; GFX900-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1
5712+
; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v2, v0
5713+
; GFX900-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1
5714+
; GFX900-SDAG-NEXT: v_add_u32_e32 v1, 1, v2
5715+
; GFX900-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1
5716+
; GFX900-SDAG-NEXT: global_store_dword v[3:4], v2, off
5717+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
5718+
; GFX900-SDAG-NEXT: global_store_dword v[3:4], v0, off
5719+
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
5720+
; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v0, v1
5721+
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
5722+
;
5723+
; GFX900-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_some:
5724+
; GFX900-GISEL: ; %bb.0: ; %bb
5725+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5726+
; GFX900-GISEL-NEXT: v_add_u32_e32 v0, 1, v0
5727+
; GFX900-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1
5728+
; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v2, v0
5729+
; GFX900-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1
5730+
; GFX900-GISEL-NEXT: v_add_u32_e32 v1, 1, v2
5731+
; GFX900-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1
5732+
; GFX900-GISEL-NEXT: global_store_dword v[3:4], v2, off
5733+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
5734+
; GFX900-GISEL-NEXT: global_store_dword v[3:4], v0, off
5735+
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
5736+
; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v0, v1
5737+
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
57045738
;
57055739
; GFX90A-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
57065740
; GFX90A-SDAG: ; %bb.0: ; %bb
@@ -8291,7 +8325,102 @@ entry:
82918325
ret <2 x i16> %add0
82928326
}
82938327

8328+
define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
8329+
; GFX67-LABEL: mul_u24_add64:
8330+
; GFX67: ; %bb.0:
8331+
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8332+
; GFX67-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
8333+
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8334+
; GFX67-NEXT: v_add_i32_e32 v0, vcc, v0, v2
8335+
; GFX67-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
8336+
; GFX67-NEXT: s_setpc_b64 s[30:31]
8337+
;
8338+
; GFX8-LABEL: mul_u24_add64:
8339+
; GFX8: ; %bb.0:
8340+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8341+
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
8342+
; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8343+
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
8344+
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
8345+
; GFX8-NEXT: s_setpc_b64 s[30:31]
8346+
;
8347+
; GFX9-SDAG-LABEL: mul_u24_add64:
8348+
; GFX9-SDAG: ; %bb.0:
8349+
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8350+
; GFX9-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
8351+
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
8352+
;
8353+
; GFX9-GISEL-LABEL: mul_u24_add64:
8354+
; GFX9-GISEL: ; %bb.0:
8355+
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8356+
; GFX9-GISEL-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
8357+
; GFX9-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8358+
; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
8359+
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v3, vcc
8360+
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
8361+
;
8362+
; GFX10-SDAG-LABEL: mul_u24_add64:
8363+
; GFX10-SDAG: ; %bb.0:
8364+
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8365+
; GFX10-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
8366+
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
8367+
;
8368+
; GFX10-GISEL-LABEL: mul_u24_add64:
8369+
; GFX10-GISEL: ; %bb.0:
8370+
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8371+
; GFX10-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1
8372+
; GFX10-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
8373+
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
8374+
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
8375+
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
8376+
%mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y)
8377+
%add = add i64 %mul, %z
8378+
ret i64 %add
8379+
}
8380+
8381+
define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
8382+
; GFX67-LABEL: mul_u24_zext_add64:
8383+
; GFX67: ; %bb.0:
8384+
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8385+
; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8386+
; GFX67-NEXT: v_add_i32_e32 v0, vcc, v0, v2
8387+
; GFX67-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
8388+
; GFX67-NEXT: s_setpc_b64 s[30:31]
8389+
;
8390+
; GFX8-LABEL: mul_u24_zext_add64:
8391+
; GFX8: ; %bb.0:
8392+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8393+
; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8394+
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
8395+
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
8396+
; GFX8-NEXT: s_setpc_b64 s[30:31]
8397+
;
8398+
; GFX9-LABEL: mul_u24_zext_add64:
8399+
; GFX9: ; %bb.0:
8400+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8401+
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8402+
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
8403+
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
8404+
; GFX9-NEXT: s_setpc_b64 s[30:31]
8405+
;
8406+
; GFX10-LABEL: mul_u24_zext_add64:
8407+
; GFX10: ; %bb.0:
8408+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8409+
; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
8410+
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
8411+
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
8412+
; GFX10-NEXT: s_setpc_b64 s[30:31]
8413+
%mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
8414+
%mul.zext = zext i32 %mul to i64
8415+
%add = add i64 %mul.zext, %z
8416+
ret i64 %add
8417+
}
8418+
8419+
declare i64 @llvm.amdgcn.mul.u24.i64(i32, i32)
8420+
declare i32 @llvm.amdgcn.mul.u24(i32, i32)
8421+
82948422
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
82958423
; GFX6: {{.*}}
82968424
; GFX7: {{.*}}
8425+
; GFX900: {{.*}}
82978426
; GFX90A: {{.*}}

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -444,31 +444,28 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
444444
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
445445
; CHECK-NEXT: s_cbranch_execz .LBB0_31
446446
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
447-
; CHECK-NEXT: v_xor_b32_e32 v5, v60, v58
448-
; CHECK-NEXT: v_lshrrev_b64 v[3:4], 16, v[56:57]
449-
; CHECK-NEXT: v_mul_u32_u24_e32 v11, 0x180, v73
450-
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v0
451-
; CHECK-NEXT: v_lshrrev_b64 v[1:2], 16, v[45:46]
452-
; CHECK-NEXT: v_lshlrev_b32_e32 v7, 16, v5
447+
; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58
448+
; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57]
449+
; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47]
450+
; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0
451+
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4
453452
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
454-
; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, s46, v11
455-
; CHECK-NEXT: v_lshlrev_b32_e32 v10, 12, v63
456-
; CHECK-NEXT: v_or_b32_e32 v4, v7, v4
457-
; CHECK-NEXT: v_mul_hi_u32_u24_e32 v7, 0x180, v73
458-
; CHECK-NEXT: v_xor_b32_e32 v6, v61, v59
459-
; CHECK-NEXT: v_lshlrev_b32_e32 v9, 16, v56
460-
; CHECK-NEXT: v_or3_b32 v10, v8, v10, v62
453+
; CHECK-NEXT: v_lshlrev_b32_e32 v9, 12, v63
454+
; CHECK-NEXT: v_xor_b32_e32 v5, v61, v59
455+
; CHECK-NEXT: v_lshlrev_b32_e32 v11, 16, v56
456+
; CHECK-NEXT: v_or_b32_e32 v3, v1, v3
457+
; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46]
458+
; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, v10
459+
; CHECK-NEXT: v_or3_b32 v8, v8, v9, v62
460+
; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
461+
; CHECK-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
462+
; CHECK-NEXT: v_or_b32_e32 v1, v11, v1
461463
; CHECK-NEXT: ; implicit-def: $vgpr42
462464
; CHECK-NEXT: ; implicit-def: $vgpr43
463465
; CHECK-NEXT: ; implicit-def: $vgpr44
464-
; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s47, v7, vcc_lo
465-
; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v11, v0
466-
; CHECK-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6]
467-
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
468-
; CHECK-NEXT: v_or_b32_e32 v2, v9, v2
469-
; CHECK-NEXT: global_store_dword v[7:8], v10, off offset:4
470-
; CHECK-NEXT: global_store_dwordx4 v[7:8], v[1:4], off offset:8
471-
; CHECK-NEXT: global_store_dwordx2 v[7:8], v[5:6], off offset:24
466+
; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4
467+
; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8
468+
; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24
472469
; CHECK-NEXT: .LBB0_31: ; %Flow
473470
; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
474471
; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4

0 commit comments

Comments
 (0)