@@ -3449,5 +3449,268 @@ entry:
3449
3449
ret void
3450
3450
}
3451
3451
3452
+ ; The first (S0) operand of the v_dot4 is derived from the LHS of the mul chain (that is %op80, %op50).
3453
+ ; These correspond to the 0th and 4th bytes starting from %inptr1.
3454
+ ; Confirm that we are actually accessing these bytes.
3455
+ ;
3456
+ ; Previously, we used the dword offset from the corresponding byte in the second (S1) operand.
3457
+ ; The result was to access the 0th byte instead of the 4th (i.e. a dword offset of 0 instead of 1).
3458
+
3459
+ define amdgpu_kernel void @ByteOffsetCorrectness (ptr addrspace (1 ) %inptr1 , i8 %l81 , i8 %l51 ) {
3460
+ ; GFX7-LABEL: ByteOffsetCorrectness:
3461
+ ; GFX7: ; %bb.0: ; %.entry
3462
+ ; GFX7-NEXT: s_load_dword s0, s[2:3], 0xb
3463
+ ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
3464
+ ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3465
+ ; GFX7-NEXT: s_mov_b32 s6, -1
3466
+ ; GFX7-NEXT: s_mov_b32 s8, 0
3467
+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3468
+ ; GFX7-NEXT: s_sext_i32_i8 s2, s0
3469
+ ; GFX7-NEXT: s_bfe_i32 s3, s0, 0x80008
3470
+ ; GFX7-NEXT: s_mov_b32 s9, s8
3471
+ ; GFX7-NEXT: s_mov_b32 s10, s6
3472
+ ; GFX7-NEXT: s_mov_b32 s11, s7
3473
+ ; GFX7-NEXT: s_and_b64 s[0:1], exec, -1
3474
+ ; GFX7-NEXT: .LBB17_1: ; %.lr.ph
3475
+ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
3476
+ ; GFX7-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:4
3477
+ ; GFX7-NEXT: buffer_load_sbyte v1, off, s[4:7], 0
3478
+ ; GFX7-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:1
3479
+ ; GFX7-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
3480
+ ; GFX7-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:3
3481
+ ; GFX7-NEXT: s_waitcnt vmcnt(4)
3482
+ ; GFX7-NEXT: v_mul_lo_u32 v0, v0, s3
3483
+ ; GFX7-NEXT: s_waitcnt vmcnt(3)
3484
+ ; GFX7-NEXT: v_mul_lo_u32 v1, v1, s2
3485
+ ; GFX7-NEXT: s_waitcnt vmcnt(1)
3486
+ ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
3487
+ ; GFX7-NEXT: s_waitcnt vmcnt(0)
3488
+ ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
3489
+ ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 8
3490
+ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
3491
+ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
3492
+ ; GFX7-NEXT: s_mov_b64 vcc, s[0:1]
3493
+ ; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0
3494
+ ; GFX7-NEXT: s_cbranch_vccnz .LBB17_1
3495
+ ; GFX7-NEXT: ; %bb.2: ; %DummyReturnBlock
3496
+ ; GFX7-NEXT: s_endpgm
3497
+ ;
3498
+ ; GFX8-LABEL: ByteOffsetCorrectness:
3499
+ ; GFX8: ; %bb.0: ; %.entry
3500
+ ; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
3501
+ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3502
+ ; GFX8-NEXT: v_mov_b32_e32 v10, 0
3503
+ ; GFX8-NEXT: v_mov_b32_e32 v11, 0
3504
+ ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3505
+ ; GFX8-NEXT: s_sext_i32_i8 s2, s6
3506
+ ; GFX8-NEXT: s_add_u32 s4, s0, 4
3507
+ ; GFX8-NEXT: s_addc_u32 s5, s1, 0
3508
+ ; GFX8-NEXT: s_bfe_i32 s3, s6, 0x80008
3509
+ ; GFX8-NEXT: s_add_u32 s6, s0, 3
3510
+ ; GFX8-NEXT: s_addc_u32 s7, s1, 0
3511
+ ; GFX8-NEXT: s_add_u32 s8, s0, 2
3512
+ ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3513
+ ; GFX8-NEXT: s_addc_u32 s9, s1, 0
3514
+ ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3515
+ ; GFX8-NEXT: s_add_u32 s0, s0, 1
3516
+ ; GFX8-NEXT: s_addc_u32 s1, s1, 0
3517
+ ; GFX8-NEXT: v_mov_b32_e32 v2, s4
3518
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s8
3519
+ ; GFX8-NEXT: v_mov_b32_e32 v7, s1
3520
+ ; GFX8-NEXT: v_mov_b32_e32 v9, s7
3521
+ ; GFX8-NEXT: v_mov_b32_e32 v3, s5
3522
+ ; GFX8-NEXT: v_mov_b32_e32 v5, s9
3523
+ ; GFX8-NEXT: v_mov_b32_e32 v6, s0
3524
+ ; GFX8-NEXT: v_mov_b32_e32 v8, s6
3525
+ ; GFX8-NEXT: s_and_b64 s[0:1], exec, -1
3526
+ ; GFX8-NEXT: .LBB17_1: ; %.lr.ph
3527
+ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
3528
+ ; GFX8-NEXT: flat_load_sbyte v12, v[0:1]
3529
+ ; GFX8-NEXT: flat_load_sbyte v13, v[2:3]
3530
+ ; GFX8-NEXT: flat_load_ubyte v14, v[4:5]
3531
+ ; GFX8-NEXT: flat_load_ubyte v15, v[6:7]
3532
+ ; GFX8-NEXT: flat_load_ubyte v16, v[8:9]
3533
+ ; GFX8-NEXT: s_waitcnt vmcnt(4)
3534
+ ; GFX8-NEXT: v_mul_lo_u32 v12, v12, s2
3535
+ ; GFX8-NEXT: s_waitcnt vmcnt(3)
3536
+ ; GFX8-NEXT: v_mul_lo_u32 v13, v13, s3
3537
+ ; GFX8-NEXT: s_waitcnt vmcnt(1)
3538
+ ; GFX8-NEXT: v_or_b32_e32 v14, v14, v15
3539
+ ; GFX8-NEXT: s_waitcnt vmcnt(0)
3540
+ ; GFX8-NEXT: v_or_b32_e32 v14, v16, v14
3541
+ ; GFX8-NEXT: v_or_b32_sdwa v13, v13, sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3542
+ ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
3543
+ ; GFX8-NEXT: s_mov_b64 vcc, s[0:1]
3544
+ ; GFX8-NEXT: flat_store_dword v[10:11], v12
3545
+ ; GFX8-NEXT: s_cbranch_vccnz .LBB17_1
3546
+ ; GFX8-NEXT: ; %bb.2: ; %DummyReturnBlock
3547
+ ; GFX8-NEXT: s_endpgm
3548
+ ;
3549
+ ; GFX9-NODL-LABEL: ByteOffsetCorrectness:
3550
+ ; GFX9-NODL: ; %bb.0: ; %.entry
3551
+ ; GFX9-NODL-NEXT: s_load_dword s4, s[2:3], 0x2c
3552
+ ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3553
+ ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
3554
+ ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
3555
+ ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
3556
+ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
3557
+ ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s4
3558
+ ; GFX9-NODL-NEXT: s_bfe_i32 s3, s4, 0x80008
3559
+ ; GFX9-NODL-NEXT: s_and_b64 vcc, exec, -1
3560
+ ; GFX9-NODL-NEXT: .LBB17_1: ; %.lr.ph
3561
+ ; GFX9-NODL-NEXT: ; =>This Inner Loop Header: Depth=1
3562
+ ; GFX9-NODL-NEXT: global_load_sbyte v3, v2, s[0:1]
3563
+ ; GFX9-NODL-NEXT: global_load_sbyte v4, v2, s[0:1] offset:4
3564
+ ; GFX9-NODL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:3
3565
+ ; GFX9-NODL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3566
+ ; GFX9-NODL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3567
+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
3568
+ ; GFX9-NODL-NEXT: v_mul_lo_u32 v3, v3, s2
3569
+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
3570
+ ; GFX9-NODL-NEXT: v_mul_lo_u32 v4, v4, s3
3571
+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
3572
+ ; GFX9-NODL-NEXT: v_or_b32_e32 v6, v6, v7
3573
+ ; GFX9-NODL-NEXT: v_or_b32_e32 v5, v5, v6
3574
+ ; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3575
+ ; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v4
3576
+ ; GFX9-NODL-NEXT: global_store_dword v[0:1], v3, off
3577
+ ; GFX9-NODL-NEXT: s_mov_b64 vcc, vcc
3578
+ ; GFX9-NODL-NEXT: s_cbranch_vccnz .LBB17_1
3579
+ ; GFX9-NODL-NEXT: ; %bb.2: ; %DummyReturnBlock
3580
+ ; GFX9-NODL-NEXT: s_endpgm
3581
+ ;
3582
+ ; GFX9-DL-LABEL: ByteOffsetCorrectness:
3583
+ ; GFX9-DL: ; %bb.0: ; %.entry
3584
+ ; GFX9-DL-NEXT: s_load_dword s4, s[2:3], 0x2c
3585
+ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3586
+ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0xc0c0400
3587
+ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0
3588
+ ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0400
3589
+ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3590
+ ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s4
3591
+ ; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80008
3592
+ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
3593
+ ; GFX9-DL-NEXT: v_perm_b32 v3, s3, v0, v1
3594
+ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
3595
+ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
3596
+ ; GFX9-DL-NEXT: s_and_b64 vcc, exec, -1
3597
+ ; GFX9-DL-NEXT: .LBB17_1: ; %.lr.ph
3598
+ ; GFX9-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3599
+ ; GFX9-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3
3600
+ ; GFX9-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4
3601
+ ; GFX9-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3602
+ ; GFX9-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3603
+ ; GFX9-DL-NEXT: global_load_ubyte v8, v2, s[0:1]
3604
+ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3605
+ ; GFX9-DL-NEXT: v_or_b32_e32 v6, v6, v7
3606
+ ; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v6
3607
+ ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3608
+ ; GFX9-DL-NEXT: v_perm_b32 v5, v8, v5, s2
3609
+ ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3610
+ ; GFX9-DL-NEXT: v_dot4_i32_i8 v4, v5, v3, v4
3611
+ ; GFX9-DL-NEXT: global_store_dword v[0:1], v4, off
3612
+ ; GFX9-DL-NEXT: s_mov_b64 vcc, vcc
3613
+ ; GFX9-DL-NEXT: s_cbranch_vccnz .LBB17_1
3614
+ ; GFX9-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3615
+ ; GFX9-DL-NEXT: s_endpgm
3616
+ ;
3617
+ ; GFX10-DL-LABEL: ByteOffsetCorrectness:
3618
+ ; GFX10-DL: ; %bb.0: ; %.entry
3619
+ ; GFX10-DL-NEXT: s_clause 0x1
3620
+ ; GFX10-DL-NEXT: s_load_dword s4, s[2:3], 0x2c
3621
+ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3622
+ ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xc0c0400
3623
+ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3624
+ ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
3625
+ ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
3626
+ ; GFX10-DL-NEXT: s_mov_b32 vcc_lo, exec_lo
3627
+ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3628
+ ; GFX10-DL-NEXT: s_sext_i32_i8 s2, s4
3629
+ ; GFX10-DL-NEXT: s_bfe_i32 s3, s4, 0x80008
3630
+ ; GFX10-DL-NEXT: v_perm_b32 v3, s2, s3, v3
3631
+ ; GFX10-DL-NEXT: .LBB17_1: ; %.lr.ph
3632
+ ; GFX10-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3633
+ ; GFX10-DL-NEXT: s_clause 0x4
3634
+ ; GFX10-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3
3635
+ ; GFX10-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4
3636
+ ; GFX10-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3637
+ ; GFX10-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3638
+ ; GFX10-DL-NEXT: global_load_ubyte v8, v2, s[0:1]
3639
+ ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3640
+ ; GFX10-DL-NEXT: v_or_b32_e32 v6, v6, v7
3641
+ ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3642
+ ; GFX10-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400
3643
+ ; GFX10-DL-NEXT: v_or_b32_e32 v4, v4, v6
3644
+ ; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3645
+ ; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v5, v3
3646
+ ; GFX10-DL-NEXT: global_store_dword v[0:1], v4, off
3647
+ ; GFX10-DL-NEXT: s_cbranch_vccnz .LBB17_1
3648
+ ; GFX10-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3649
+ ; GFX10-DL-NEXT: s_endpgm
3650
+ ;
3651
+ ; GFX11-DL-LABEL: ByteOffsetCorrectness:
3652
+ ; GFX11-DL: ; %bb.0: ; %.entry
3653
+ ; GFX11-DL-NEXT: s_clause 0x1
3654
+ ; GFX11-DL-NEXT: s_load_b32 s4, s[2:3], 0x2c
3655
+ ; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
3656
+ ; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0xc0c0400 :: v_dual_mov_b32 v2, 0
3657
+ ; GFX11-DL-NEXT: v_mov_b32_e32 v0, 0
3658
+ ; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
3659
+ ; GFX11-DL-NEXT: s_mov_b32 vcc_lo, exec_lo
3660
+ ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
3661
+ ; GFX11-DL-NEXT: s_sext_i32_i8 s2, s4
3662
+ ; GFX11-DL-NEXT: s_bfe_i32 s3, s4, 0x80008
3663
+ ; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3664
+ ; GFX11-DL-NEXT: v_perm_b32 v3, s2, s3, v3
3665
+ ; GFX11-DL-NEXT: .p2align 6
3666
+ ; GFX11-DL-NEXT: .LBB17_1: ; %.lr.ph
3667
+ ; GFX11-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3668
+ ; GFX11-DL-NEXT: s_clause 0x4
3669
+ ; GFX11-DL-NEXT: global_load_u8 v4, v2, s[0:1] offset:3
3670
+ ; GFX11-DL-NEXT: global_load_u8 v5, v2, s[0:1] offset:4
3671
+ ; GFX11-DL-NEXT: global_load_u8 v6, v2, s[0:1] offset:2
3672
+ ; GFX11-DL-NEXT: global_load_u8 v7, v2, s[0:1] offset:1
3673
+ ; GFX11-DL-NEXT: global_load_u8 v8, v2, s[0:1]
3674
+ ; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
3675
+ ; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v7
3676
+ ; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
3677
+ ; GFX11-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400
3678
+ ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3679
+ ; GFX11-DL-NEXT: v_or_b32_e32 v4, v4, v6
3680
+ ; GFX11-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3681
+ ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
3682
+ ; GFX11-DL-NEXT: v_dot4_i32_iu8 v4, v5, v3, v4 neg_lo:[1,1,0]
3683
+ ; GFX11-DL-NEXT: global_store_b32 v[0:1], v4, off
3684
+ ; GFX11-DL-NEXT: s_cbranch_vccnz .LBB17_1
3685
+ ; GFX11-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3686
+ ; GFX11-DL-NEXT: s_endpgm
3687
+ .entry:
3688
+ br label %.lr.ph
3689
+
3690
+ .lr.ph: ; preds = %.lr.ph, %.entry
3691
+ %l80 = load i8 , ptr addrspace (1 ) %inptr1 , align 1 ; byte 0 of %inptr1
3692
+ %op80 = sext i8 %l80 to i32 ; LHS of %mul8
3693
+ %op81 = sext i8 %l81 to i32 ; kernel argument, RHS of %mul8
3694
+ %mul8 = mul i32 %op80 , %op81
3695
+ %gep50 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 4 ; byte offset 4 from %inptr1
3696
+ %l50 = load i8 , ptr addrspace (1 ) %gep50 , align 1
3697
+ %op50 = sext i8 %l50 to i32 ; LHS of %mul5
3698
+ %op51 = sext i8 %l51 to i32 ; kernel argument, RHS of %mul5
3699
+ %mul5 = mul i32 %op50 , %op51
3700
+ %gep40 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 3 ; byte offset 3 from %inptr1
3701
+ %l40 = load i8 , ptr addrspace (1 ) %gep40 , align 1
3702
+ %gep30 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 2 ; byte offset 2 from %inptr1
3703
+ %l30 = load i8 , ptr addrspace (1 ) %gep30 , align 1
3704
+ %gep20 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 1 ; byte offset 1 from %inptr1
3705
+ %l20 = load i8 , ptr addrspace (1 ) %gep20 , align 1
3706
+ %ivadd31 = or i8 %l30 , %l20 ; OR of bytes 2 and 1
3707
+ %ivadd42 = or i8 %l40 , %ivadd31 ; OR of bytes 3, 2 and 1
3708
+ %ivadd4 = sext i8 %ivadd42 to i32 ; OR'd bytes sign-extended to i32
3709
+ %ivadd5 = or i32 %mul5 , %ivadd4 ; joined with %mul5 by `or`, not `add`
3710
+ %ivadd8 = add i32 %mul8 , %ivadd5 ; %mul8 + (%mul5 | %ivadd4)
3711
+ store i32 %ivadd8 , ptr addrspace (1 ) null , align 4 ; result is stored through a null addrspace(1) pointer
3712
+ br label %.lr.ph ; unconditional back-edge: the loop has no exit in the IR
3713
+ }
3714
+
3452
3715
3453
3716
declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments