@@ -3538,6 +3538,94 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
3538
3538
ret void
3539
3539
}
3540
3540
3541
+ ; Check that exact regions with execz-affected instructions are as short as possible: in the expected output below, only the buffer store in %if sits between the s_and_saveexec on the saved entry exec mask and the exec restore.
3542
+ define amdgpu_ps float @short_exact_regions (<8 x i32 > inreg %rsrc , <4 x i32 > inreg %sampler , float %c , ptr addrspace (4 ) %p ) {
3543
+ ; GFX9-W64-LABEL: short_exact_regions:
3544
+ ; GFX9-W64: ; %bb.0: ; %main_body
3545
+ ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3546
+ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3547
+ ; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
3548
+ ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3549
+ ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3550
+ ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
3551
+ ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
3552
+ ; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
3553
+ ; GFX9-W64-NEXT: ; %bb.1: ; %if
3554
+ ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3555
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3556
+ ; GFX9-W64-NEXT: v_readfirstlane_b32 s16, v0
3557
+ ; GFX9-W64-NEXT: s_buffer_load_dword s16, s[8:11], s16 offset:0x0
3558
+ ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3559
+ ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s16
3560
+ ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
3561
+ ; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3562
+ ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
3563
+ ; GFX9-W64-NEXT: .LBB59_2: ; %endif
3564
+ ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
3565
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3566
+ ; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
3567
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3568
+ ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
3569
+ ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3570
+ ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3571
+ ; GFX9-W64-NEXT: ; return to shader part epilog
3572
+ ;
3573
+ ; GFX10-W32-LABEL: short_exact_regions:
3574
+ ; GFX10-W32: ; %bb.0: ; %main_body
3575
+ ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3576
+ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3577
+ ; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
3578
+ ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3579
+ ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
3580
+ ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3581
+ ; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
3582
+ ; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
3583
+ ; GFX10-W32-NEXT: ; %bb.1: ; %if
3584
+ ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3585
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3586
+ ; GFX10-W32-NEXT: v_readfirstlane_b32 s14, v0
3587
+ ; GFX10-W32-NEXT: s_buffer_load_dword s14, s[8:11], s14 offset:0x0
3588
+ ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3589
+ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s14
3590
+ ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
3591
+ ; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3592
+ ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
3593
+ ; GFX10-W32-NEXT: .LBB59_2: ; %endif
3594
+ ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
3595
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3596
+ ; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3597
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3598
+ ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
3599
+ ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3600
+ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3601
+ ; GFX10-W32-NEXT: ; return to shader part epilog
3602
+ main_body:
3603
+ %tex1 = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %c , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0 ; first sample; %tex1 is stored in %if and read again in %endif
3604
+ %idx0 = load <4 x i32 >, ptr addrspace (4 ) %p , align 4 ; loaded here, but only element 0 is consumed, inside %if
3605
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo (i32 -1 , i32 0 )
3606
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi (i32 -1 , i32 %lo ) ; mbcnt.lo + mbcnt.hi with an all-ones mask (presumably the usual lane-index idiom; confirm against the AMDGPU docs)
3607
+ %cc = icmp uge i32 %hi , 16 ; %cc is true when %hi >= 16 (unsigned compare)
3608
+ br i1 %cc , label %endif , label %if ; %cc true skips %if, so %if runs only when %hi < 16
3609
+
3610
+ if:
3611
+ %idx1 = extractelement <4 x i32 > %idx0 , i64 0
3612
+ %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32 (i32 %idx1 ) ; result is the offset operand of the s.buffer.load below
3613
+ %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32 (<4 x i32 > %sampler , i32 %idx2 , i32 0 ) ; note: %sampler is reused here as the buffer descriptor operand
3614
+ ; The store below is the only instruction the expected output wraps in s_and_saveexec / exec restore using the saved entry exec mask.
3615
+ call void @llvm.amdgcn.struct.buffer.store.v4f32 (<4 x float > %tex1 , <4 x i32 > undef , i32 %idx3 , i32 0 , i32 0 , i32 0 ) ; NOTE(review): rsrc operand is undef; consider poison, but confirm the expected output is unchanged after regenerating
3616
+ br label %endif
3617
+
3618
+ endif:
3619
+ %d = extractelement <4 x float > %tex1 , i64 0 ; component 0 of the first sample becomes the coordinate of the second sample
3620
+ %tex2 = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %d , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0
3621
+ %r0 = extractelement <4 x float > %tex1 , i64 1
3622
+ %r1 = extractelement <4 x float > %tex2 , i64 2
3623
+ %r2 = fadd float %r0 , %r1 ; %r2 = %tex1[1] + %tex2[2]
3624
+ %out = call float @llvm.amdgcn.wqm.f32 (float %r2 ) ; returned value is passed through llvm.amdgcn.wqm
3625
+ ; The function returns the llvm.amdgcn.wqm result as its shader output.
3626
+ ret float %out
3627
+ }
3628
+
3541
3629
declare void @llvm.amdgcn.exp.f32 (i32 , i32 , float , float , float , float , i1 , i1 ) #1
3542
3630
declare void @llvm.amdgcn.image.store.1d.v4f32.i32 (<4 x float >, i32 , i32 , <8 x i32 >, i32 , i32 ) #1
3543
3631
@@ -3577,6 +3665,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3577
3665
declare float @llvm.amdgcn.interp.p2 (float , float , i32 , i32 , i32 ) #2
3578
3666
declare i32 @llvm.amdgcn.ds.swizzle (i32 , i32 )
3579
3667
declare float @llvm.amdgcn.s.buffer.load.f32 (<4 x i32 >, i32 , i32 immarg) #7
3668
+ declare i32 @llvm.amdgcn.readfirstlane.i32 (i32 )
3580
3669
3581
3670
attributes #1 = { nounwind }
3582
3671
attributes #2 = { nounwind readonly }
0 commit comments