Skip to content

Commit ae6dbed

Browse files
authored
[AMDGPU] Use correct DWord for v_dot4 S0 operand (#115224)
Fixes a copy-paste typo. The typo resulted in producing bad v_perm-based operands for the v_dot4 combine. When adding a corresponding byte pair to the v_dot byte pair chains, we must take note of the byte position in the corresponding source nodes. These byte positions are used to ensure we extract the correct DWord from the ultimate source, and formulate a correct perm_mask from the extracted DWord. With the typo, the S0 byte would use the DWord offset for the corresponding S1 byte. If this offset was not the same as the true DWord offset for the S0 byte, we would extract and use the wrong byte for S0 in the v_dot. Fixes #112941
1 parent f0e2301 commit ae6dbed

File tree

2 files changed

+264
-1
lines changed

2 files changed

+264
-1
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14011,7 +14011,7 @@ static void placeSources(ByteProvider<SDValue> &Src0,
1401114011
Src0s.push_back(
1401214012
{*Src0.Src,
1401314013
((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14014-
Src1.SrcOffset / 4});
14014+
Src0.SrcOffset / 4});
1401514015
Src1s.push_back(
1401614016
{*Src1.Src,
1401714017
((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),

llvm/test/CodeGen/AMDGPU/idot4s.ll

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3449,5 +3449,268 @@ entry:
34493449
ret void
34503450
}
34513451

3452+
; The first (S0) operand of the v_dot4 is derived from the LHS of the mul chain (that is %op80, %op50).
3453+
; These correspond to the 0th and 4th bytes starting from %inptr1.
3454+
; Confirm that we are actually accessing these bytes.
3455+
;
3456+
; Previously, we used the dword offset from the corresponding byte in the second (S1) operand.
3457+
; The result was to access the 0th byte instead of the 4th (i.e. a dword offset of 0 instead of 1).
3458+
3459+
define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) %inptr1, i8 %l81, i8 %l51) {
3460+
; GFX7-LABEL: ByteOffsetCorrectness:
3461+
; GFX7: ; %bb.0: ; %.entry
3462+
; GFX7-NEXT: s_load_dword s0, s[2:3], 0xb
3463+
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
3464+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
3465+
; GFX7-NEXT: s_mov_b32 s6, -1
3466+
; GFX7-NEXT: s_mov_b32 s8, 0
3467+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3468+
; GFX7-NEXT: s_sext_i32_i8 s2, s0
3469+
; GFX7-NEXT: s_bfe_i32 s3, s0, 0x80008
3470+
; GFX7-NEXT: s_mov_b32 s9, s8
3471+
; GFX7-NEXT: s_mov_b32 s10, s6
3472+
; GFX7-NEXT: s_mov_b32 s11, s7
3473+
; GFX7-NEXT: s_and_b64 s[0:1], exec, -1
3474+
; GFX7-NEXT: .LBB17_1: ; %.lr.ph
3475+
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
3476+
; GFX7-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:4
3477+
; GFX7-NEXT: buffer_load_sbyte v1, off, s[4:7], 0
3478+
; GFX7-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:1
3479+
; GFX7-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
3480+
; GFX7-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:3
3481+
; GFX7-NEXT: s_waitcnt vmcnt(4)
3482+
; GFX7-NEXT: v_mul_lo_u32 v0, v0, s3
3483+
; GFX7-NEXT: s_waitcnt vmcnt(3)
3484+
; GFX7-NEXT: v_mul_lo_u32 v1, v1, s2
3485+
; GFX7-NEXT: s_waitcnt vmcnt(1)
3486+
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
3487+
; GFX7-NEXT: s_waitcnt vmcnt(0)
3488+
; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
3489+
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 8
3490+
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
3491+
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
3492+
; GFX7-NEXT: s_mov_b64 vcc, s[0:1]
3493+
; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0
3494+
; GFX7-NEXT: s_cbranch_vccnz .LBB17_1
3495+
; GFX7-NEXT: ; %bb.2: ; %DummyReturnBlock
3496+
; GFX7-NEXT: s_endpgm
3497+
;
3498+
; GFX8-LABEL: ByteOffsetCorrectness:
3499+
; GFX8: ; %bb.0: ; %.entry
3500+
; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
3501+
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3502+
; GFX8-NEXT: v_mov_b32_e32 v10, 0
3503+
; GFX8-NEXT: v_mov_b32_e32 v11, 0
3504+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3505+
; GFX8-NEXT: s_sext_i32_i8 s2, s6
3506+
; GFX8-NEXT: s_add_u32 s4, s0, 4
3507+
; GFX8-NEXT: s_addc_u32 s5, s1, 0
3508+
; GFX8-NEXT: s_bfe_i32 s3, s6, 0x80008
3509+
; GFX8-NEXT: s_add_u32 s6, s0, 3
3510+
; GFX8-NEXT: s_addc_u32 s7, s1, 0
3511+
; GFX8-NEXT: s_add_u32 s8, s0, 2
3512+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
3513+
; GFX8-NEXT: s_addc_u32 s9, s1, 0
3514+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
3515+
; GFX8-NEXT: s_add_u32 s0, s0, 1
3516+
; GFX8-NEXT: s_addc_u32 s1, s1, 0
3517+
; GFX8-NEXT: v_mov_b32_e32 v2, s4
3518+
; GFX8-NEXT: v_mov_b32_e32 v4, s8
3519+
; GFX8-NEXT: v_mov_b32_e32 v7, s1
3520+
; GFX8-NEXT: v_mov_b32_e32 v9, s7
3521+
; GFX8-NEXT: v_mov_b32_e32 v3, s5
3522+
; GFX8-NEXT: v_mov_b32_e32 v5, s9
3523+
; GFX8-NEXT: v_mov_b32_e32 v6, s0
3524+
; GFX8-NEXT: v_mov_b32_e32 v8, s6
3525+
; GFX8-NEXT: s_and_b64 s[0:1], exec, -1
3526+
; GFX8-NEXT: .LBB17_1: ; %.lr.ph
3527+
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
3528+
; GFX8-NEXT: flat_load_sbyte v12, v[0:1]
3529+
; GFX8-NEXT: flat_load_sbyte v13, v[2:3]
3530+
; GFX8-NEXT: flat_load_ubyte v14, v[4:5]
3531+
; GFX8-NEXT: flat_load_ubyte v15, v[6:7]
3532+
; GFX8-NEXT: flat_load_ubyte v16, v[8:9]
3533+
; GFX8-NEXT: s_waitcnt vmcnt(4)
3534+
; GFX8-NEXT: v_mul_lo_u32 v12, v12, s2
3535+
; GFX8-NEXT: s_waitcnt vmcnt(3)
3536+
; GFX8-NEXT: v_mul_lo_u32 v13, v13, s3
3537+
; GFX8-NEXT: s_waitcnt vmcnt(1)
3538+
; GFX8-NEXT: v_or_b32_e32 v14, v14, v15
3539+
; GFX8-NEXT: s_waitcnt vmcnt(0)
3540+
; GFX8-NEXT: v_or_b32_e32 v14, v16, v14
3541+
; GFX8-NEXT: v_or_b32_sdwa v13, v13, sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3542+
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
3543+
; GFX8-NEXT: s_mov_b64 vcc, s[0:1]
3544+
; GFX8-NEXT: flat_store_dword v[10:11], v12
3545+
; GFX8-NEXT: s_cbranch_vccnz .LBB17_1
3546+
; GFX8-NEXT: ; %bb.2: ; %DummyReturnBlock
3547+
; GFX8-NEXT: s_endpgm
3548+
;
3549+
; GFX9-NODL-LABEL: ByteOffsetCorrectness:
3550+
; GFX9-NODL: ; %bb.0: ; %.entry
3551+
; GFX9-NODL-NEXT: s_load_dword s4, s[2:3], 0x2c
3552+
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3553+
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
3554+
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
3555+
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
3556+
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
3557+
; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s4
3558+
; GFX9-NODL-NEXT: s_bfe_i32 s3, s4, 0x80008
3559+
; GFX9-NODL-NEXT: s_and_b64 vcc, exec, -1
3560+
; GFX9-NODL-NEXT: .LBB17_1: ; %.lr.ph
3561+
; GFX9-NODL-NEXT: ; =>This Inner Loop Header: Depth=1
3562+
; GFX9-NODL-NEXT: global_load_sbyte v3, v2, s[0:1]
3563+
; GFX9-NODL-NEXT: global_load_sbyte v4, v2, s[0:1] offset:4
3564+
; GFX9-NODL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:3
3565+
; GFX9-NODL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3566+
; GFX9-NODL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3567+
; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
3568+
; GFX9-NODL-NEXT: v_mul_lo_u32 v3, v3, s2
3569+
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
3570+
; GFX9-NODL-NEXT: v_mul_lo_u32 v4, v4, s3
3571+
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
3572+
; GFX9-NODL-NEXT: v_or_b32_e32 v6, v6, v7
3573+
; GFX9-NODL-NEXT: v_or_b32_e32 v5, v5, v6
3574+
; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3575+
; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v4
3576+
; GFX9-NODL-NEXT: global_store_dword v[0:1], v3, off
3577+
; GFX9-NODL-NEXT: s_mov_b64 vcc, vcc
3578+
; GFX9-NODL-NEXT: s_cbranch_vccnz .LBB17_1
3579+
; GFX9-NODL-NEXT: ; %bb.2: ; %DummyReturnBlock
3580+
; GFX9-NODL-NEXT: s_endpgm
3581+
;
3582+
; GFX9-DL-LABEL: ByteOffsetCorrectness:
3583+
; GFX9-DL: ; %bb.0: ; %.entry
3584+
; GFX9-DL-NEXT: s_load_dword s4, s[2:3], 0x2c
3585+
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3586+
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0xc0c0400
3587+
; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0
3588+
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0400
3589+
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3590+
; GFX9-DL-NEXT: s_sext_i32_i8 s3, s4
3591+
; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80008
3592+
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
3593+
; GFX9-DL-NEXT: v_perm_b32 v3, s3, v0, v1
3594+
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
3595+
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
3596+
; GFX9-DL-NEXT: s_and_b64 vcc, exec, -1
3597+
; GFX9-DL-NEXT: .LBB17_1: ; %.lr.ph
3598+
; GFX9-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3599+
; GFX9-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3
3600+
; GFX9-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4
3601+
; GFX9-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3602+
; GFX9-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3603+
; GFX9-DL-NEXT: global_load_ubyte v8, v2, s[0:1]
3604+
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3605+
; GFX9-DL-NEXT: v_or_b32_e32 v6, v6, v7
3606+
; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v6
3607+
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3608+
; GFX9-DL-NEXT: v_perm_b32 v5, v8, v5, s2
3609+
; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3610+
; GFX9-DL-NEXT: v_dot4_i32_i8 v4, v5, v3, v4
3611+
; GFX9-DL-NEXT: global_store_dword v[0:1], v4, off
3612+
; GFX9-DL-NEXT: s_mov_b64 vcc, vcc
3613+
; GFX9-DL-NEXT: s_cbranch_vccnz .LBB17_1
3614+
; GFX9-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3615+
; GFX9-DL-NEXT: s_endpgm
3616+
;
3617+
; GFX10-DL-LABEL: ByteOffsetCorrectness:
3618+
; GFX10-DL: ; %bb.0: ; %.entry
3619+
; GFX10-DL-NEXT: s_clause 0x1
3620+
; GFX10-DL-NEXT: s_load_dword s4, s[2:3], 0x2c
3621+
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3622+
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xc0c0400
3623+
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3624+
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
3625+
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
3626+
; GFX10-DL-NEXT: s_mov_b32 vcc_lo, exec_lo
3627+
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3628+
; GFX10-DL-NEXT: s_sext_i32_i8 s2, s4
3629+
; GFX10-DL-NEXT: s_bfe_i32 s3, s4, 0x80008
3630+
; GFX10-DL-NEXT: v_perm_b32 v3, s2, s3, v3
3631+
; GFX10-DL-NEXT: .LBB17_1: ; %.lr.ph
3632+
; GFX10-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3633+
; GFX10-DL-NEXT: s_clause 0x4
3634+
; GFX10-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3
3635+
; GFX10-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4
3636+
; GFX10-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3637+
; GFX10-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3638+
; GFX10-DL-NEXT: global_load_ubyte v8, v2, s[0:1]
3639+
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3640+
; GFX10-DL-NEXT: v_or_b32_e32 v6, v6, v7
3641+
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3642+
; GFX10-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400
3643+
; GFX10-DL-NEXT: v_or_b32_e32 v4, v4, v6
3644+
; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3645+
; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v5, v3
3646+
; GFX10-DL-NEXT: global_store_dword v[0:1], v4, off
3647+
; GFX10-DL-NEXT: s_cbranch_vccnz .LBB17_1
3648+
; GFX10-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3649+
; GFX10-DL-NEXT: s_endpgm
3650+
;
3651+
; GFX11-DL-LABEL: ByteOffsetCorrectness:
3652+
; GFX11-DL: ; %bb.0: ; %.entry
3653+
; GFX11-DL-NEXT: s_clause 0x1
3654+
; GFX11-DL-NEXT: s_load_b32 s4, s[2:3], 0x2c
3655+
; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
3656+
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0xc0c0400 :: v_dual_mov_b32 v2, 0
3657+
; GFX11-DL-NEXT: v_mov_b32_e32 v0, 0
3658+
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
3659+
; GFX11-DL-NEXT: s_mov_b32 vcc_lo, exec_lo
3660+
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
3661+
; GFX11-DL-NEXT: s_sext_i32_i8 s2, s4
3662+
; GFX11-DL-NEXT: s_bfe_i32 s3, s4, 0x80008
3663+
; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3664+
; GFX11-DL-NEXT: v_perm_b32 v3, s2, s3, v3
3665+
; GFX11-DL-NEXT: .p2align 6
3666+
; GFX11-DL-NEXT: .LBB17_1: ; %.lr.ph
3667+
; GFX11-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3668+
; GFX11-DL-NEXT: s_clause 0x4
3669+
; GFX11-DL-NEXT: global_load_u8 v4, v2, s[0:1] offset:3
3670+
; GFX11-DL-NEXT: global_load_u8 v5, v2, s[0:1] offset:4
3671+
; GFX11-DL-NEXT: global_load_u8 v6, v2, s[0:1] offset:2
3672+
; GFX11-DL-NEXT: global_load_u8 v7, v2, s[0:1] offset:1
3673+
; GFX11-DL-NEXT: global_load_u8 v8, v2, s[0:1]
3674+
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
3675+
; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v7
3676+
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
3677+
; GFX11-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400
3678+
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3679+
; GFX11-DL-NEXT: v_or_b32_e32 v4, v4, v6
3680+
; GFX11-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3681+
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
3682+
; GFX11-DL-NEXT: v_dot4_i32_iu8 v4, v5, v3, v4 neg_lo:[1,1,0]
3683+
; GFX11-DL-NEXT: global_store_b32 v[0:1], v4, off
3684+
; GFX11-DL-NEXT: s_cbranch_vccnz .LBB17_1
3685+
; GFX11-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3686+
; GFX11-DL-NEXT: s_endpgm
3687+
.entry:
3688+
br label %.lr.ph
3689+
3690+
.lr.ph: ; preds = %.lr.ph, %.entry
3691+
%l80 = load i8, ptr addrspace(1) %inptr1, align 1
3692+
%op80 = sext i8 %l80 to i32
3693+
%op81 = sext i8 %l81 to i32
3694+
%mul8 = mul i32 %op80, %op81
3695+
%gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 4
3696+
%l50 = load i8, ptr addrspace(1) %gep50, align 1
3697+
%op50 = sext i8 %l50 to i32
3698+
%op51 = sext i8 %l51 to i32
3699+
%mul5 = mul i32 %op50, %op51
3700+
%gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 3
3701+
%l40 = load i8, ptr addrspace(1) %gep40, align 1
3702+
%gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 2
3703+
%l30 = load i8, ptr addrspace(1) %gep30, align 1
3704+
%gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 1
3705+
%l20 = load i8, ptr addrspace(1) %gep20, align 1
3706+
%ivadd31 = or i8 %l30, %l20
3707+
%ivadd42 = or i8 %l40, %ivadd31
3708+
%ivadd4 = sext i8 %ivadd42 to i32
3709+
%ivadd5 = or i32 %mul5, %ivadd4
3710+
%ivadd8 = add i32 %mul8, %ivadd5
3711+
store i32 %ivadd8, ptr addrspace(1) null, align 4
3712+
br label %.lr.ph
3713+
}
3714+
34523715

34533716
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)