Skip to content

AMDGPU: Make frame index folding logic consistent with eliminateFrameIndex #129633

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,7 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
const unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::S_ADD_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_ADD_U32:
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_CO_U32_e32:
// TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
Expand Down
146 changes: 146 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
Original file line number Diff line number Diff line change
Expand Up @@ -426,4 +426,150 @@ body: |
$sgpr4 = COPY %4
$sgpr5 = COPY %5
SI_RETURN implicit $sgpr4, implicit $sgpr5

...

# Folding test: the input materializes frame index %stack.0 with S_MOV_B32 and
# then adds the constant 128 with S_ADD_U32 (frame index in the first source
# operand). The expected output has no S_MOV_B32; %stack.0 appears directly as
# the first operand of S_ADD_U32.
name: fold_frame_index__s_add_u32__fi_const
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 16384
stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_u32__fi_const
; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 %stack.0, 128, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_ADD_U32 %0, 128, implicit-def $scc
$sgpr4 = COPY %1
SI_RETURN implicit $sgpr4
...

---
# Folding test, commuted operand order: the constant 128 is the first source
# operand of S_ADD_U32 and the S_MOV_B32 of %stack.0 is the second. The expected
# output keeps that order and uses %stack.0 directly, with no S_MOV_B32 left.
name: fold_frame_index__s_add_u32__const_fi
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 16384
stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_u32__const_fi
; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 128, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_ADD_U32 128, %0, implicit-def $scc
$sgpr4 = COPY %1
SI_RETURN implicit $sgpr4
...

---
# Folding test with the constant 16 (the test name calls this an inline
# immediate): the S_MOV_B32 of %stack.0 feeds the first source operand of
# S_ADD_U32. The expected output uses %stack.0 directly in that position.
name: fold_frame_index__s_add_u32__fi_inlineimm
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 16384
stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_u32__fi_inlineimm
; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 %stack.0, 16, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_ADD_U32 %0, 16, implicit-def $scc
$sgpr4 = COPY %1
SI_RETURN implicit $sgpr4
...

---
# Folding test with the constant 16 in the first source operand and the
# S_MOV_B32 of %stack.0 in the second. The expected output keeps the operand
# order and uses %stack.0 directly, with no S_MOV_B32 left.
name: fold_frame_index__s_add_u32__inlineimm_fi
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 16384
stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_u32__inlineimm_fi
; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 16, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_ADD_U32 16, %0, implicit-def $scc
$sgpr4 = COPY %1
SI_RETURN implicit $sgpr4
...

---
# Negative test: the input feeds S_AND_B32 with frame index %stack.1 and the
# constant 12345, each materialized by its own S_MOV_B32. The expected output
# still contains the S_MOV_B32 of %stack.1; only 12345 appears directly as an
# operand of S_AND_B32.
# NOTE(review): the test name ends in "s_or_b32" but the body uses S_AND_B32 -
# looks like a copy-paste leftover; confirm before renaming (the name is
# repeated in the label check line inside the body).
name: no_fold_literal_and_fi_s_or_b32
tracksRegLiveness: true
frameInfo:
maxAlignment: 16
localFrameSize: 8192
stack:
- { id: 0, size: 4096, alignment: 4, local-offset: 0 }
- { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
body: |
bb.0:
; CHECK-LABEL: name: no_fold_literal_and_fi_s_or_b32
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]]
%0:sreg_32 = S_MOV_B32 12345
%1:sreg_32 = S_MOV_B32 %stack.1
%2:sreg_32 = S_AND_B32 killed %1, killed %0, implicit-def dead $scc
S_ENDPGM 0, implicit %2

...

---
# Negative test: the input feeds S_OR_B32 with frame index %stack.1 and the
# constant 12345, each materialized by its own S_MOV_B32. The expected output
# still contains the S_MOV_B32 of %stack.1; only 12345 appears directly as an
# operand of S_OR_B32.
name: no_fold_literal_or_fi_s_or_b32
tracksRegLiveness: true
frameInfo:
maxAlignment: 16
localFrameSize: 8192
stack:
- { id: 0, size: 4096, alignment: 4, local-offset: 0 }
- { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
body: |
bb.0:
; CHECK-LABEL: name: no_fold_literal_or_fi_s_or_b32
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]]
%0:sreg_32 = S_MOV_B32 12345
%1:sreg_32 = S_MOV_B32 %stack.1
%2:sreg_32 = S_OR_B32 killed %1, killed %0, implicit-def dead $scc
S_ENDPGM 0, implicit %2

...

---
# Negative test: the input feeds S_MUL_I32 with frame index %stack.1 and the
# constant 12345, each materialized by its own S_MOV_B32. The expected output
# still contains the S_MOV_B32 of %stack.1; only 12345 appears directly as an
# operand of S_MUL_I32.
name: no_fold_literal_and_fi_s_mul_i32
tracksRegLiveness: true
frameInfo:
maxAlignment: 16
localFrameSize: 8192
stack:
- { id: 0, size: 4096, alignment: 4, local-offset: 0 }
- { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
body: |
bb.0:
; CHECK-LABEL: name: no_fold_literal_and_fi_s_mul_i32
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MUL_I32_]]
%0:sreg_32 = S_MOV_B32 12345
%1:sreg_32 = S_MOV_B32 %stack.1
%2:sreg_32 = S_MUL_I32 killed %1, killed %0, implicit-def dead $scc
S_ENDPGM 0, implicit %2

...
26 changes: 16 additions & 10 deletions llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
Original file line number Diff line number Diff line change
Expand Up @@ -394,8 +394,10 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], 128, implicit-def dead $scc
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
; CHECK-NEXT: SI_RETURN implicit [[COPY]]
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
%2:vgpr_32 = COPY %1
Expand All @@ -410,8 +412,10 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 128, [[S_MOV_B32_]], implicit-def dead $scc
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
; CHECK-NEXT: SI_RETURN implicit [[COPY]]
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
%2:vgpr_32 = COPY %1
Expand All @@ -426,8 +430,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit %1
; CHECK: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e64_]]
%0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
%1:vgpr_32 = COPY %0
SI_RETURN implicit %1
Expand All @@ -441,8 +445,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit %1
; CHECK: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e64_]]
%0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
%1:vgpr_32 = COPY %0
SI_RETURN implicit %1
Expand Down Expand Up @@ -521,8 +525,10 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], 128, implicit-def dead $scc
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
; CHECK-NEXT: SI_RETURN implicit [[COPY]]
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
%2:vgpr_32 = COPY %1
Expand Down
42 changes: 42 additions & 0 deletions llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
Original file line number Diff line number Diff line change
Expand Up @@ -374,4 +374,46 @@ vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i
ret void
}

; Kernel with two addrspace(5) allocas of [1024 x i32]. The address of the
; 16-byte-aligned one is converted to i32, masked with -512 inside a
; self-looping block, and compared (ugt) against 9999999 to pick the branch.
; The expected assembly contains a scalar AND whose last operand is the
; literal 0x1fe00.
; NOTE(review): the masked value is named %or although it is produced by an
; `and` instruction - presumably inherited from the sibling OR test.
; GCN-LABEL: {{^}}fi_sop2_and_literal_error:
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1fe00
define amdgpu_kernel void @fi_sop2_and_literal_error() #0 {
entry:
%.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
%Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
%p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
br label %.shuffle.then.i.i.i.i

.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
store i64 0, ptr addrspace(5) null, align 4
%or = and i32 %p2i, -512
%icmp = icmp ugt i32 %or, 9999999
br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i

vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
%wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
ret void
}

; Kernel with two addrspace(5) allocas of [1024 x i32]. The address of the
; 16-byte-aligned one is converted to i32, OR-ed with 12345 inside a
; self-looping block, and compared (ugt) against 9999999 to pick the branch.
; The expected assembly contains a scalar OR whose last operand is the
; literal 0x3039 (12345 in hexadecimal).
; GCN-LABEL: {{^}}fi_sop2_or_literal_error:
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
define amdgpu_kernel void @fi_sop2_or_literal_error() #0 {
entry:
%.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
%Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
%p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
br label %.shuffle.then.i.i.i.i

.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
store i64 0, ptr addrspace(5) null, align 4
%or = or i32 %p2i, 12345
%icmp = icmp ugt i32 %or, 9999999
br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i

vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
%wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
ret void
}

attributes #0 = { nounwind }
54 changes: 35 additions & 19 deletions llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s

; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; GCN: s_and_b32 s{{[0-9]+}}, [[FI]], 0xfffc
; GCN: v_mov_b32_e32 [[VFI:v[0-9]+]], [[FI]]{{$}}
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}], [[VFI]]
define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 15, ptr addrspace(5) %alloca
Expand All @@ -20,11 +21,15 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
}

; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: v_and_b32
; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
; SCRATCH256K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; SCRATCH256K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc

; SCRATCH1024K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; SCRATCH1024K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc

; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc

; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
%alloca = alloca i32, align 4, addrspace(5)
Expand All @@ -36,11 +41,17 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
}

; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: v_and_b32
; SCRATCH256K-NOT: v_and_b32
; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
; SCRATCH128K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH256K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: and_b32
; SCRATCH256K-NOT: and_b32

; SCRATCH1024K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; SCRATCH1024K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x3fffc

; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x3fffc

; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
%alloca = alloca i32, align 4, addrspace(5)
Expand All @@ -52,11 +63,16 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
}

; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: v_and_b32
; SCRATCH256K-NOT: v_and_b32
; SCRATCH1024K-NOT: v_and_b32
; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
; SCRATCH128K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH256K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH1024K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}

; SCRATCH128K-NOT: and_b32
; SCRATCH256K-NOT: and_b32
; SCRATCH1024K-NOT: and_b32

; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0xffffc
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
%alloca = alloca i32, align 4, addrspace(5)
Expand All @@ -69,7 +85,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {

; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN-NOT: v_and_b32
; GCN-NOT: and_b32
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
%alloca = alloca i32, align 4, addrspace(5)
Expand Down
Loading
Loading