Skip to content

[AMDGPU][SDAG] Test ISD::PTRADD handling in various special cases #145329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: users/ritter-x2a/06-12-_amdgpu_sdag_handle_isd_ptradd_in_vop3_patterns
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX6,GFX6_PTRADD %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX6,GFX6_LEGACY %s

; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF.

; Kernel that volatile-loads the i32 at %in[tid] and the i32 one element after
; it, adds the two values and stores the sum to %out.
; The second address is the first plus one i32. Both sets of checks expect that
; constant to appear as the immediate "offset:4" on the second
; buffer_load_dword. They differ in how the rest of the address is formed. The
; PTRADD-prefixed checks add s[2:3] to the shifted v0 with
; v_add_i32/v_addc_u32 and copy s8 and s9 from the zeroed s10, while the
; LEGACY-prefixed checks copy s[2:3] into s[8:9] and leave (v0 << 2, 0) in
; v[0:1].
define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6_PTRADD-LABEL: v_add_i32:
; GFX6_PTRADD: ; %bb.0:
; GFX6_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6_PTRADD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6_PTRADD-NEXT: s_mov_b32 s7, 0x100f000
; GFX6_PTRADD-NEXT: s_mov_b32 s10, 0
; GFX6_PTRADD-NEXT: s_mov_b32 s11, s7
; GFX6_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX6_PTRADD-NEXT: v_mov_b32_e32 v1, s3
; GFX6_PTRADD-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX6_PTRADD-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6_PTRADD-NEXT: s_mov_b32 s8, s10
; GFX6_PTRADD-NEXT: s_mov_b32 s9, s10
; GFX6_PTRADD-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; GFX6_PTRADD-NEXT: s_waitcnt vmcnt(0)
; GFX6_PTRADD-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
; GFX6_PTRADD-NEXT: s_waitcnt vmcnt(0)
; GFX6_PTRADD-NEXT: s_mov_b32 s6, -1
; GFX6_PTRADD-NEXT: s_mov_b32 s4, s0
; GFX6_PTRADD-NEXT: s_mov_b32 s5, s1
; GFX6_PTRADD-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX6_PTRADD-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6_PTRADD-NEXT: s_endpgm
;
; GFX6_LEGACY-LABEL: v_add_i32:
; GFX6_LEGACY: ; %bb.0:
; GFX6_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6_LEGACY-NEXT: s_mov_b32 s7, 0x100f000
; GFX6_LEGACY-NEXT: s_mov_b32 s10, 0
; GFX6_LEGACY-NEXT: s_mov_b32 s11, s7
; GFX6_LEGACY-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX6_LEGACY-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX6_LEGACY-NEXT: v_mov_b32_e32 v1, 0
; GFX6_LEGACY-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; GFX6_LEGACY-NEXT: s_waitcnt vmcnt(0)
; GFX6_LEGACY-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
; GFX6_LEGACY-NEXT: s_waitcnt vmcnt(0)
; GFX6_LEGACY-NEXT: s_mov_b32 s6, -1
; GFX6_LEGACY-NEXT: s_mov_b32 s4, s0
; GFX6_LEGACY-NEXT: s_mov_b32 s5, s1
; GFX6_LEGACY-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX6_LEGACY-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6_LEGACY-NEXT: s_endpgm
; Element index comes from the x workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; %gep is the address of %in[%tid] (inbounds); %b_ptr is one i32 past %gep
; (no inbounds flag on this second getelementptr).
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
%b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
; Both loads are volatile; the checks above contain two separate
; buffer_load_dword instructions, each followed by its own s_waitcnt vmcnt(0).
%a = load volatile i32, ptr addrspace(1) %gep
%b = load volatile i32, ptr addrspace(1) %b_ptr
%result = add i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX6: {{.*}}
206 changes: 206 additions & 0 deletions llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,209 @@ define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
%gep = getelementptr inbounds i8, ptr %base, i64 %mul
ret ptr %gep
}

; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectGlobalSAddr.
; The store address is the kernel argument %p plus a zero-extended 32-bit
; offset derived from the x workitem id (shifted left by 2) plus a constant 16.
; The LEGACY-prefixed checks expect a global_store_dword that takes v0 as the
; offset, s[0:1] as the base and "offset:16" as the immediate. The
; PTRADD-prefixed checks expect a 64-bit v_lshl_add_u64 into v[0:1] and a store
; through v[0:1] with "off offset:16".
define amdgpu_kernel void @uniform_base_varying_offset_imm(ptr addrspace(1) %p) {
; GFX942_PTRADD-LABEL: uniform_base_varying_offset_imm:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
; GFX942_PTRADD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off offset:16
; GFX942_PTRADD-NEXT: s_endpgm
;
; GFX942_LEGACY-LABEL: uniform_base_varying_offset_imm:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942_LEGACY-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 1
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[0:1] offset:16
; GFX942_LEGACY-NEXT: s_endpgm
entry:
; Byte offset is the workitem id times 4, widened to i64 with zext.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%shift = shl i32 %tid, 2
%voffset = zext i32 %shift to i64
; %gep1 adds the per-workitem offset to %p; %gep2 adds the constant 16 on top.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %voffset
%gep2 = getelementptr inbounds i8, ptr addrspace(1) %gep1, i64 16
store i32 1, ptr addrspace(1) %gep2
ret void
}

; Adjusted from global-saddr-load.ll. Tests PTRADD handling in
; AMDGPUDAGToDAGISel::SelectSMRDBaseOffset.
; Loads an i32 from %sbase plus the zero-extended kernel argument %soffset and
; stores the loaded bits, bitcast to float, to %r.
; The LEGACY-prefixed checks expect one s_load_dword that takes s6 as its
; offset operand. The PTRADD-prefixed checks expect an explicit
; s_add_u32/s_addc_u32 on s[0:1] followed by an s_load_dword with offset 0x0.
define amdgpu_kernel void @global_load_saddr_i32_uniform_offset(ptr addrspace(1) %sbase, i32 %soffset, ptr addrspace(1) %r) {
; GFX942_PTRADD-LABEL: global_load_saddr_i32_uniform_offset:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942_PTRADD-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, s6
; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, 0
; GFX942_PTRADD-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0
; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942_PTRADD-NEXT: s_endpgm
;
; GFX942_LEGACY-LABEL: global_load_saddr_i32_uniform_offset:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942_LEGACY-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: s_load_dword s0, s[0:1], s6 offset:0x0
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0
; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942_LEGACY-NEXT: s_endpgm
; The 32-bit offset is zero-extended before it is added to the base pointer.
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i32, ptr addrspace(1) %gep0
; Reinterpret the loaded bits as float before storing. Judging by the value
; name this is meant to route the result through a VGPR (review: confirm
; against the original test in global-saddr-load.ll).
%to.vgpr = bitcast i32 %load to float
store float %to.vgpr, ptr addrspace(1) %r
ret void
}

; Adjusted from llvm.amdgcn.global.load.lds.ll, tests the offset lowering for
; Intrinsic::amdgcn_global_load_lds.
; The global pointer %gptr is passed inreg; the zero-extended %voffset is added
; to it and the sum is passed to llvm.amdgcn.global.load.lds together with the
; addrspace(3) pointer %lptr.
; The LEGACY-prefixed checks expect global_load_lds_dword to take v1 as the
; offset and s[0:1] as the base. The PTRADD-prefixed checks expect a 64-bit
; v_lshl_add_u64 into v[2:3] and a global_load_lds_dword through v[2:3] with
; "off". Both keep "offset:48 sc1".
define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX942_PTRADD-LABEL: global_load_lds_dword_saddr_and_vaddr:
; GFX942_PTRADD: ; %bb.0: ; %main_body
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, v1
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, 0
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: v_readfirstlane_b32 s0, v0
; GFX942_PTRADD-NEXT: s_mov_b32 m0, s0
; GFX942_PTRADD-NEXT: s_nop 0
; GFX942_PTRADD-NEXT: global_load_lds_dword v[2:3], off offset:48 sc1
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: global_load_lds_dword_saddr_and_vaddr:
; GFX942_LEGACY: ; %bb.0: ; %main_body
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_readfirstlane_b32 s2, v0
; GFX942_LEGACY-NEXT: s_mov_b32 m0, s2
; GFX942_LEGACY-NEXT: s_nop 0
; GFX942_LEGACY-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
main_body:
%voffset.64 = zext i32 %voffset to i64
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
; The immediate 48 appears as offset:48 in the checks above. The 4 and the 16
; are presumably the access size in bytes and the cache-policy bits (sc1 in the
; output); confirm against the intrinsic definition.
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 4, i32 48, i32 16)
ret void
}

; Taken from shl_add_ptr_global.ll, tests PTRADD handling in
; SITargetLowering::performSHLPtrCombine.
; The address of element 32 of a [512 x i32] array at %ptr (byte offset 0x80)
; is converted to an integer, shifted left by 2 and used as the address of an
; atomicrmw fadd. The unshifted integer is additionally stored to %extra.use.
; The LEGACY-prefixed checks expect the shift to be applied to the incoming
; pointer v[4:5] and the shifted constant to appear as "offset:512" on the
; atomic. The PTRADD-prefixed checks expect the add of 0x80 first, then a shift
; of that sum, and no immediate offset on the atomic.
define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) {
; GFX942_PTRADD-LABEL: shl_base_global_ptr_global_atomic_fadd:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_mov_b64 s[0:1], 0x80
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, s[0:1]
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX942_PTRADD-NEXT: global_atomic_add_f32 v[4:5], v6, off
; GFX942_PTRADD-NEXT: global_store_dwordx2 v[2:3], v[0:1], off sc0 sc1
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: shl_base_global_ptr_global_atomic_fadd:
; GFX942_LEGACY: ; %bb.0:
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[0:1], 2, v[4:5]
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX942_LEGACY-NEXT: global_atomic_add_f32 v[0:1], v6, off offset:512
; GFX942_LEGACY-NEXT: s_mov_b64 s[0:1], 0x80
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, s[0:1]
; GFX942_LEGACY-NEXT: global_store_dwordx2 v[2:3], v[0:1], off sc0 sc1
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
; Element 32 of an i32 array is 32 * 4 = 128 (0x80) bytes past %ptr.
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
; Round-trip through i64 so that the shift is applied to the pointer value.
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to ptr addrspace(1)
; The result of the atomic is not used by any later instruction.
%unused = atomicrmw fadd ptr addrspace(1) %castback, float 100.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; Second use of %cast, in addition to the shift above.
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret void
}

; Test PTRADD handling in TargetLowering::SimplifyDemandedBits and
; TargetLowering::ShrinkDemandedOp.
; Adds the 64-bit %offset to an addrspace(4) pointer, casts the result to
; addrspace(6) and loads an i32 through it. The function name refers to these
; as the const and const32 address spaces.
; The LEGACY-prefixed checks expect a 32-bit v_add_u32 of v0 and v2. The
; PTRADD-prefixed checks expect a 64-bit v_lshl_add_u64 of v[0:1] and v[2:3].
; In both, only v0 of the sum is read back with v_readfirstlane_b32 and s1 is
; set to 0 before the s_load_dword.
define i32 @gep_in_const_as_cast_to_const32_as(ptr addrspace(4) %src, i64 %offset) {
; GFX942_PTRADD-LABEL: gep_in_const_as_cast_to_const32_as:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942_PTRADD-NEXT: s_mov_b32 s1, 0
; GFX942_PTRADD-NEXT: v_readfirstlane_b32 s0, v0
; GFX942_PTRADD-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s0
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: gep_in_const_as_cast_to_const32_as:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_add_u32_e32 v0, v0, v2
; GFX942_LEGACY-NEXT: s_mov_b32 s1, 0
; GFX942_LEGACY-NEXT: v_readfirstlane_b32 s0, v0
; GFX942_LEGACY-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s0
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
entry:
; Pointer arithmetic is done on the addrspace(4) pointer with a 64-bit offset.
%gep = getelementptr i8, ptr addrspace(4) %src, i64 %offset
; The load goes through the addrspace(6) pointer produced by this cast.
%gep.cast = addrspacecast ptr addrspace(4) %gep to ptr addrspace(6)
%l = load i32, ptr addrspace(6) %gep.cast
ret i32 %l
}

; Zero-initialized constant array in addrspace(4); it is the source of the
; memcpy in replace_const0_memcpy_by_memset.
@CG = addrspace(4) constant [16 x i32] zeroinitializer, align 4

; Test PTRADD handling in isMemSrcFromConstant.
; Copies 8 bytes from byte offset 4 of the zero-initialized constant CG to
; %dst.
; The LEGACY-prefixed checks expect zeros to be materialized in v[2:3] and
; stored with flat_store_dwordx2, with no load from CG. The PTRADD-prefixed
; checks expect the address of CG to be fetched through the gotpcrel32
; sequence and 8 bytes to be loaded from offset 0x4 before the store.
define void @replace_const0_memcpy_by_memset(ptr align 4 %dst) {
; GFX942_PTRADD-LABEL: replace_const0_memcpy_by_memset:
; GFX942_PTRADD: ; %bb.0: ; %entry
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, CG@gotpcrel32@lo+4
; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, CG@gotpcrel32@hi+12
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
; GFX942_PTRADD-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942_PTRADD-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: replace_const0_memcpy_by_memset:
; GFX942_LEGACY: ; %bb.0: ; %entry
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 0
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, v2
; GFX942_LEGACY-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
entry:
; The memcpy source is a constant byte offset into the global, not the global
; itself.
%gep = getelementptr i8, ptr addrspace(4) @CG, i64 4
tail call void @llvm.memcpy.p0.p4.i64(ptr noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %gep, i64 8, i1 false)
ret void
}

; Declaration of the memcpy intrinsic called by replace_const0_memcpy_by_memset
; (destination in the default address space, source in addrspace(4), i64 size).
declare void @llvm.memcpy.p0.p4.i64(ptr noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg)

; Empty metadata node, referenced by the !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode attachments on the atomicrmw in
; shl_base_global_ptr_global_atomic_fadd.
!0 = !{}