@@ -314,9 +314,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
314
314
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
315
315
; SI-NEXT: s_mov_b32 s3, 0xf000
316
316
; SI-NEXT: s_waitcnt lgkmcnt(0)
317
- ; SI-NEXT: s_and_b32 s2, s2, 0xff
318
- ; SI-NEXT: s_flbit_i32_b32 s2, s2
319
- ; SI-NEXT: s_sub_i32 s4, s2, 24
317
+ ; SI-NEXT: s_lshl_b32 s2, s2, 24
318
+ ; SI-NEXT: s_flbit_i32_b32 s4, s2
320
319
; SI-NEXT: s_mov_b32 s2, -1
321
320
; SI-NEXT: v_mov_b32_e32 v0, s4
322
321
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -327,9 +326,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
327
326
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
328
327
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
329
328
; VI-NEXT: s_waitcnt lgkmcnt(0)
330
- ; VI-NEXT: s_and_b32 s2, s2, 0xff
329
+ ; VI-NEXT: s_lshl_b32 s2, s2, 24
331
330
; VI-NEXT: s_flbit_i32_b32 s2, s2
332
- ; VI-NEXT: s_sub_i32 s2, s2, 24
333
331
; VI-NEXT: v_mov_b32_e32 v0, s0
334
332
; VI-NEXT: v_mov_b32_e32 v1, s1
335
333
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -349,13 +347,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
349
347
; EG-NEXT: ALU clause starting at 8:
350
348
; EG-NEXT: MOV * T0.X, 0.0,
351
349
; EG-NEXT: ALU clause starting at 9:
352
- ; EG-NEXT: FFBH_UINT T0.W, T0.X,
350
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
351
+ ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
352
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
353
353
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
354
354
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
355
- ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
356
- ; EG-NEXT: -24(nan), 0(0.000000e+00)
357
355
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
358
- ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
356
+ ; EG-NEXT: LSHL * T1.W, PS, literal.y,
359
357
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
360
358
; EG-NEXT: LSHL T0.X, PV.W, PS,
361
359
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -391,9 +389,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
391
389
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
392
390
; SI-NEXT: s_mov_b32 s3, 0xf000
393
391
; SI-NEXT: s_waitcnt lgkmcnt(0)
394
- ; SI-NEXT: s_and_b32 s2, s2, 0xffff
395
- ; SI-NEXT: s_flbit_i32_b32 s2, s2
396
- ; SI-NEXT: s_add_i32 s4, s2, -16
392
+ ; SI-NEXT: s_lshl_b32 s2, s2, 16
393
+ ; SI-NEXT: s_flbit_i32_b32 s4, s2
397
394
; SI-NEXT: s_mov_b32 s2, -1
398
395
; SI-NEXT: v_mov_b32_e32 v0, s4
399
396
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -426,13 +423,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
426
423
; EG-NEXT: ALU clause starting at 8:
427
424
; EG-NEXT: MOV * T0.X, 0.0,
428
425
; EG-NEXT: ALU clause starting at 9:
429
- ; EG-NEXT: FFBH_UINT T0.W, T0.X,
426
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
427
+ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
428
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
430
429
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
431
430
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
432
- ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
433
- ; EG-NEXT: -16(nan), 0(0.000000e+00)
434
431
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
435
- ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
432
+ ; EG-NEXT: LSHL * T1.W, PS, literal.y,
436
433
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
437
434
; EG-NEXT: LSHL T0.X, PV.W, PS,
438
435
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -590,8 +587,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
590
587
; SI-NEXT: s_mov_b32 s4, s0
591
588
; SI-NEXT: s_mov_b32 s5, s1
592
589
; SI-NEXT: s_waitcnt vmcnt(0)
593
- ; SI-NEXT: v_ffbh_u32_e32 v1, v0
594
- ; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1
590
+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
591
+ ; SI-NEXT: v_ffbh_u32_e32 v1, v1
595
592
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
596
593
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
597
594
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -605,8 +602,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
605
602
; VI-NEXT: v_mov_b32_e32 v1, s3
606
603
; VI-NEXT: flat_load_ubyte v0, v[0:1]
607
604
; VI-NEXT: s_waitcnt vmcnt(0)
608
- ; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
609
- ; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
605
+ ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
606
+ ; VI-NEXT: v_ffbh_u32_e32 v1, v1
610
607
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
611
608
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
612
609
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -618,7 +615,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
618
615
; EG: ; %bb.0:
619
616
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
620
617
; EG-NEXT: TEX 0 @6
621
- ; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
618
+ ; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
622
619
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
623
620
; EG-NEXT: CF_END
624
621
; EG-NEXT: PAD
@@ -627,10 +624,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
627
624
; EG-NEXT: ALU clause starting at 8:
628
625
; EG-NEXT: MOV * T0.X, KC0[2].Z,
629
626
; EG-NEXT: ALU clause starting at 9:
630
- ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
631
- ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
632
- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
633
- ; EG-NEXT: -24(nan), 3(4.203895e-45)
627
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
628
+ ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
629
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
630
+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
631
+ ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
634
632
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
635
633
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
636
634
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -685,8 +683,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
685
683
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
686
684
; SI-NEXT: s_waitcnt vmcnt(0)
687
685
; SI-NEXT: v_or_b32_e32 v0, v0, v1
688
- ; SI-NEXT: v_ffbh_u32_e32 v1, v0
689
- ; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
686
+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
687
+ ; SI-NEXT: v_ffbh_u32_e32 v1, v1
690
688
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
691
689
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
692
690
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -721,7 +719,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
721
719
; EG: ; %bb.0:
722
720
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
723
721
; EG-NEXT: TEX 0 @6
724
- ; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
722
+ ; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
725
723
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
726
724
; EG-NEXT: CF_END
727
725
; EG-NEXT: PAD
@@ -730,10 +728,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
730
728
; EG-NEXT: ALU clause starting at 8:
731
729
; EG-NEXT: MOV * T0.X, KC0[2].Z,
732
730
; EG-NEXT: ALU clause starting at 9:
733
- ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
734
- ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
735
- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
736
- ; EG-NEXT: -16(nan), 3(4.203895e-45)
731
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
732
+ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
733
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
734
+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
735
+ ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
737
736
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
738
737
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
739
738
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -1102,8 +1101,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1102
1101
; SI-NEXT: s_mov_b32 s4, s0
1103
1102
; SI-NEXT: s_mov_b32 s5, s1
1104
1103
; SI-NEXT: s_waitcnt vmcnt(0)
1104
+ ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1105
1105
; SI-NEXT: v_ffbh_u32_e32 v0, v0
1106
- ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
1107
1106
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1108
1107
; SI-NEXT: s_endpgm
1109
1108
;
@@ -1116,8 +1115,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1116
1115
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1117
1116
; VI-NEXT: flat_load_ubyte v0, v[0:1]
1118
1117
; VI-NEXT: s_waitcnt vmcnt(0)
1119
- ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1120
- ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
1118
+ ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1119
+ ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1121
1120
; VI-NEXT: v_mov_b32_e32 v0, s0
1122
1121
; VI-NEXT: v_mov_b32_e32 v1, s1
1123
1122
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1136,13 +1135,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1136
1135
; EG-NEXT: ALU clause starting at 8:
1137
1136
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1138
1137
; EG-NEXT: ALU clause starting at 9:
1139
- ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1138
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1139
+ ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1140
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
1140
1141
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1141
1142
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1142
- ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1143
- ; EG-NEXT: -24(nan), 0(0.000000e+00)
1144
1143
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1145
- ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
1144
+ ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1146
1145
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1147
1146
; EG-NEXT: LSHL T0.X, PV.W, PS,
1148
1147
; EG-NEXT: LSHL * T0.W, literal.x, PS,
0 commit comments