@@ -314,8 +314,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
314
314
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
315
315
; SI-NEXT: s_mov_b32 s3, 0xf000
316
316
; SI-NEXT: s_waitcnt lgkmcnt(0)
317
- ; SI-NEXT: s_lshl_b32 s2, s2, 24
318
- ; SI-NEXT: s_flbit_i32_b32 s4, s2
317
+ ; SI-NEXT: s_and_b32 s2, s2, 0xff
318
+ ; SI-NEXT: s_flbit_i32_b32 s2, s2
319
+ ; SI-NEXT: s_sub_i32 s4, s2, 24
319
320
; SI-NEXT: s_mov_b32 s2, -1
320
321
; SI-NEXT: v_mov_b32_e32 v0, s4
321
322
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -326,8 +327,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
326
327
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
327
328
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
328
329
; VI-NEXT: s_waitcnt lgkmcnt(0)
329
- ; VI-NEXT: s_lshl_b32 s2, s2, 24
330
+ ; VI-NEXT: s_and_b32 s2, s2, 0xff
330
331
; VI-NEXT: s_flbit_i32_b32 s2, s2
332
+ ; VI-NEXT: s_sub_i32 s2, s2, 24
331
333
; VI-NEXT: v_mov_b32_e32 v0, s0
332
334
; VI-NEXT: v_mov_b32_e32 v1, s1
333
335
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -347,13 +349,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
347
349
; EG-NEXT: ALU clause starting at 8:
348
350
; EG-NEXT: MOV * T0.X, 0.0,
349
351
; EG-NEXT: ALU clause starting at 9:
350
- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
351
- ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
352
- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
352
+ ; EG-NEXT: FFBH_UINT T0.W, T0.X,
353
353
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
354
354
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
355
+ ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
356
+ ; EG-NEXT: -24(nan), 0(0.000000e+00)
355
357
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
356
- ; EG-NEXT: LSHL * T1.W, PS , literal.y,
358
+ ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
357
359
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
358
360
; EG-NEXT: LSHL T0.X, PV.W, PS,
359
361
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -389,8 +391,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
389
391
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
390
392
; SI-NEXT: s_mov_b32 s3, 0xf000
391
393
; SI-NEXT: s_waitcnt lgkmcnt(0)
392
- ; SI-NEXT: s_lshl_b32 s2, s2, 16
393
- ; SI-NEXT: s_flbit_i32_b32 s4, s2
394
+ ; SI-NEXT: s_and_b32 s2, s2, 0xffff
395
+ ; SI-NEXT: s_flbit_i32_b32 s2, s2
396
+ ; SI-NEXT: s_add_i32 s4, s2, -16
394
397
; SI-NEXT: s_mov_b32 s2, -1
395
398
; SI-NEXT: v_mov_b32_e32 v0, s4
396
399
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -423,13 +426,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
423
426
; EG-NEXT: ALU clause starting at 8:
424
427
; EG-NEXT: MOV * T0.X, 0.0,
425
428
; EG-NEXT: ALU clause starting at 9:
426
- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
427
- ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
428
- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
429
+ ; EG-NEXT: FFBH_UINT T0.W, T0.X,
429
430
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
430
431
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
432
+ ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
433
+ ; EG-NEXT: -16(nan), 0(0.000000e+00)
431
434
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
432
- ; EG-NEXT: LSHL * T1.W, PS , literal.y,
435
+ ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
433
436
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
434
437
; EG-NEXT: LSHL T0.X, PV.W, PS,
435
438
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -587,8 +590,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
587
590
; SI-NEXT: s_mov_b32 s4, s0
588
591
; SI-NEXT: s_mov_b32 s5, s1
589
592
; SI-NEXT: s_waitcnt vmcnt(0)
590
- ; SI-NEXT: v_lshlrev_b32_e32 v1, 24 , v0
591
- ; SI-NEXT: v_ffbh_u32_e32 v1, v1
593
+ ; SI-NEXT: v_ffbh_u32_e32 v1, v0
594
+ ; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24 , v1
592
595
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
593
596
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
594
597
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -602,8 +605,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
602
605
; VI-NEXT: v_mov_b32_e32 v1, s3
603
606
; VI-NEXT: flat_load_ubyte v0, v[0:1]
604
607
; VI-NEXT: s_waitcnt vmcnt(0)
605
- ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
606
- ; VI-NEXT: v_ffbh_u32_e32 v1, v1
608
+ ; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
609
+ ; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24 , v1
607
610
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
608
611
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
609
612
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -615,7 +618,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
615
618
; EG: ; %bb.0:
616
619
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
617
620
; EG-NEXT: TEX 0 @6
618
- ; EG-NEXT: ALU 16 , @9, KC0[CB0:0-32], KC1[]
621
+ ; EG-NEXT: ALU 15 , @9, KC0[CB0:0-32], KC1[]
619
622
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
620
623
; EG-NEXT: CF_END
621
624
; EG-NEXT: PAD
@@ -624,11 +627,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
624
627
; EG-NEXT: ALU clause starting at 8:
625
628
; EG-NEXT: MOV * T0.X, KC0[2].Z,
626
629
; EG-NEXT: ALU clause starting at 9:
627
- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
628
- ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
629
- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
630
- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
631
- ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
630
+ ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
631
+ ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
632
+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
633
+ ; EG-NEXT: -24(nan), 3(4.203895e-45)
632
634
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
633
635
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
634
636
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -683,8 +685,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
683
685
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
684
686
; SI-NEXT: s_waitcnt vmcnt(0)
685
687
; SI-NEXT: v_or_b32_e32 v0, v0, v1
686
- ; SI-NEXT: v_lshlrev_b32_e32 v1, 16 , v0
687
- ; SI-NEXT: v_ffbh_u32_e32 v1, v1
688
+ ; SI-NEXT: v_ffbh_u32_e32 v1, v0
689
+ ; SI-NEXT: v_add_i32_e32 v1, vcc, -16 , v1
688
690
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
689
691
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
690
692
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -719,7 +721,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
719
721
; EG: ; %bb.0:
720
722
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
721
723
; EG-NEXT: TEX 0 @6
722
- ; EG-NEXT: ALU 16 , @9, KC0[CB0:0-32], KC1[]
724
+ ; EG-NEXT: ALU 15 , @9, KC0[CB0:0-32], KC1[]
723
725
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
724
726
; EG-NEXT: CF_END
725
727
; EG-NEXT: PAD
@@ -728,11 +730,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
728
730
; EG-NEXT: ALU clause starting at 8:
729
731
; EG-NEXT: MOV * T0.X, KC0[2].Z,
730
732
; EG-NEXT: ALU clause starting at 9:
731
- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
732
- ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
733
- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
734
- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
735
- ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
733
+ ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
734
+ ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
735
+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
736
+ ; EG-NEXT: -16(nan), 3(4.203895e-45)
736
737
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
737
738
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
738
739
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -1101,8 +1102,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1101
1102
; SI-NEXT: s_mov_b32 s4, s0
1102
1103
; SI-NEXT: s_mov_b32 s5, s1
1103
1104
; SI-NEXT: s_waitcnt vmcnt(0)
1104
- ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1105
1105
; SI-NEXT: v_ffbh_u32_e32 v0, v0
1106
+ ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
1106
1107
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1107
1108
; SI-NEXT: s_endpgm
1108
1109
;
@@ -1115,8 +1116,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1115
1116
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1116
1117
; VI-NEXT: flat_load_ubyte v0, v[0:1]
1117
1118
; VI-NEXT: s_waitcnt vmcnt(0)
1118
- ; VI-NEXT: v_lshlrev_b32_e32 v0, 24 , v0
1119
- ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1119
+ ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1120
+ ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24 , v0
1120
1121
; VI-NEXT: v_mov_b32_e32 v0, s0
1121
1122
; VI-NEXT: v_mov_b32_e32 v1, s1
1122
1123
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1135,13 +1136,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1135
1136
; EG-NEXT: ALU clause starting at 8:
1136
1137
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1137
1138
; EG-NEXT: ALU clause starting at 9:
1138
- ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1139
- ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1140
- ; EG-NEXT: FFBH_UINT T0.W, PV.W,
1139
+ ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1141
1140
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1142
1141
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1142
+ ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1143
+ ; EG-NEXT: -24(nan), 0(0.000000e+00)
1143
1144
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1144
- ; EG-NEXT: LSHL * T1.W, PS , literal.y,
1145
+ ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
1145
1146
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1146
1147
; EG-NEXT: LSHL T0.X, PV.W, PS,
1147
1148
; EG-NEXT: LSHL * T0.W, literal.x, PS,
0 commit comments