@@ -314,9 +314,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
314
314
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
315
315
; SI-NEXT: s_mov_b32 s3, 0xf000
316
316
; SI-NEXT: s_waitcnt lgkmcnt(0)
317
- ; SI-NEXT: s_and_b32 s2, s2, 0xff
318
- ; SI-NEXT: s_flbit_i32_b32 s2, s2
319
- ; SI-NEXT: s_sub_i32 s4, s2, 24
317
+ ; SI-NEXT: s_lshl_b32 s2, s2, 24
318
+ ; SI-NEXT: s_flbit_i32_b32 s4, s2
320
319
; SI-NEXT: s_mov_b32 s2, -1
321
320
; SI-NEXT: v_mov_b32_e32 v0, s4
322
321
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -327,12 +326,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
327
326
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
328
327
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
329
328
; VI-NEXT: s_waitcnt lgkmcnt(0)
330
- ; VI-NEXT: s_and_b32 s2, s2, 0xff
329
+ ; VI-NEXT: s_lshl_b32 s2, s2, 24
331
330
; VI-NEXT: s_flbit_i32_b32 s2, s2
332
- ; VI-NEXT: s_add_i32 s2, s2, -16
333
331
; VI-NEXT: v_mov_b32_e32 v0, s0
334
- ; VI-NEXT: v_add_u16_e64 v2, s2, -8
335
332
; VI-NEXT: v_mov_b32_e32 v1, s1
333
+ ; VI-NEXT: v_mov_b32_e32 v2, s2
336
334
; VI-NEXT: flat_store_byte v[0:1], v2
337
335
; VI-NEXT: s_endpgm
338
336
;
@@ -349,13 +347,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
349
347
; EG-NEXT: ALU clause starting at 8:
350
348
; EG-NEXT: MOV * T0.X, 0.0,
351
349
; EG-NEXT: ALU clause starting at 9:
352
- ; EG-NEXT: FFBH_UINT T0.W, T0.X,
350
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
351
+ ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
352
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
353
353
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
354
354
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
355
- ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
356
- ; EG-NEXT: -24(nan), 0(0.000000e+00)
357
355
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
358
- ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
356
+ ; EG-NEXT: LSHL * T1.W, PS , literal.y,
359
357
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
360
358
; EG-NEXT: LSHL T0.X, PV.W, PS,
361
359
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -391,9 +389,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
391
389
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
392
390
; SI-NEXT: s_mov_b32 s3, 0xf000
393
391
; SI-NEXT: s_waitcnt lgkmcnt(0)
394
- ; SI-NEXT: s_and_b32 s2, s2, 0xffff
395
- ; SI-NEXT: s_flbit_i32_b32 s2, s2
396
- ; SI-NEXT: s_add_i32 s4, s2, -16
392
+ ; SI-NEXT: s_lshl_b32 s2, s2, 16
393
+ ; SI-NEXT: s_flbit_i32_b32 s4, s2
397
394
; SI-NEXT: s_mov_b32 s2, -1
398
395
; SI-NEXT: v_mov_b32_e32 v0, s4
399
396
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -426,13 +423,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
426
423
; EG-NEXT: ALU clause starting at 8:
427
424
; EG-NEXT: MOV * T0.X, 0.0,
428
425
; EG-NEXT: ALU clause starting at 9:
429
- ; EG-NEXT: FFBH_UINT T0.W, T0.X,
426
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
427
+ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
428
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
430
429
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
431
430
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
432
- ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
433
- ; EG-NEXT: -16(nan), 0(0.000000e+00)
434
431
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
435
- ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
432
+ ; EG-NEXT: LSHL * T1.W, PS , literal.y,
436
433
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
437
434
; EG-NEXT: LSHL T0.X, PV.W, PS,
438
435
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -590,8 +587,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
590
587
; SI-NEXT: s_mov_b32 s4, s0
591
588
; SI-NEXT: s_mov_b32 s5, s1
592
589
; SI-NEXT: s_waitcnt vmcnt(0)
593
- ; SI-NEXT: v_ffbh_u32_e32 v1, v0
594
- ; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24 , v1
590
+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 24 , v0
591
+ ; SI-NEXT: v_ffbh_u32_e32 v1, v1
595
592
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
596
593
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
597
594
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -605,9 +602,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
605
602
; VI-NEXT: v_mov_b32_e32 v1, s3
606
603
; VI-NEXT: flat_load_ubyte v0, v[0:1]
607
604
; VI-NEXT: s_waitcnt vmcnt(0)
608
- ; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
609
- ; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
610
- ; VI-NEXT: v_add_u16_e32 v1, -8, v1
605
+ ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
606
+ ; VI-NEXT: v_ffbh_u32_e32 v1, v1
611
607
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
612
608
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
613
609
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -619,7 +615,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
619
615
; EG: ; %bb.0:
620
616
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
621
617
; EG-NEXT: TEX 0 @6
622
- ; EG-NEXT: ALU 15 , @9, KC0[CB0:0-32], KC1[]
618
+ ; EG-NEXT: ALU 16 , @9, KC0[CB0:0-32], KC1[]
623
619
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
624
620
; EG-NEXT: CF_END
625
621
; EG-NEXT: PAD
@@ -628,10 +624,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
628
624
; EG-NEXT: ALU clause starting at 8:
629
625
; EG-NEXT: MOV * T0.X, KC0[2].Z,
630
626
; EG-NEXT: ALU clause starting at 9:
631
- ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
632
- ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
633
- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
634
- ; EG-NEXT: -24(nan), 3(4.203895e-45)
627
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
628
+ ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
629
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
630
+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
631
+ ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
635
632
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
636
633
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
637
634
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -686,8 +683,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
686
683
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
687
684
; SI-NEXT: s_waitcnt vmcnt(0)
688
685
; SI-NEXT: v_or_b32_e32 v0, v0, v1
689
- ; SI-NEXT: v_ffbh_u32_e32 v1, v0
690
- ; SI-NEXT: v_add_i32_e32 v1, vcc, -16 , v1
686
+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16 , v0
687
+ ; SI-NEXT: v_ffbh_u32_e32 v1, v1
691
688
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
692
689
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
693
690
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -722,7 +719,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
722
719
; EG: ; %bb.0:
723
720
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
724
721
; EG-NEXT: TEX 0 @6
725
- ; EG-NEXT: ALU 15 , @9, KC0[CB0:0-32], KC1[]
722
+ ; EG-NEXT: ALU 16 , @9, KC0[CB0:0-32], KC1[]
726
723
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
727
724
; EG-NEXT: CF_END
728
725
; EG-NEXT: PAD
@@ -731,10 +728,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
731
728
; EG-NEXT: ALU clause starting at 8:
732
729
; EG-NEXT: MOV * T0.X, KC0[2].Z,
733
730
; EG-NEXT: ALU clause starting at 9:
734
- ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
735
- ; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
736
- ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
737
- ; EG-NEXT: -16(nan), 3(4.203895e-45)
731
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
732
+ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
733
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
734
+ ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
735
+ ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
738
736
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
739
737
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
740
738
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -1103,8 +1101,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1103
1101
; SI-NEXT: s_mov_b32 s4, s0
1104
1102
; SI-NEXT: s_mov_b32 s5, s1
1105
1103
; SI-NEXT: s_waitcnt vmcnt(0)
1104
+ ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1106
1105
; SI-NEXT: v_ffbh_u32_e32 v0, v0
1107
- ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
1108
1106
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1109
1107
; SI-NEXT: s_endpgm
1110
1108
;
@@ -1117,9 +1115,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1117
1115
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1118
1116
; VI-NEXT: flat_load_ubyte v0, v[0:1]
1119
1117
; VI-NEXT: s_waitcnt vmcnt(0)
1120
- ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1121
- ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
1122
- ; VI-NEXT: v_add_u16_e32 v2, -8, v0
1118
+ ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1119
+ ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1123
1120
; VI-NEXT: v_mov_b32_e32 v0, s0
1124
1121
; VI-NEXT: v_mov_b32_e32 v1, s1
1125
1122
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1138,13 +1135,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
1138
1135
; EG-NEXT: ALU clause starting at 8:
1139
1136
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1140
1137
; EG-NEXT: ALU clause starting at 9:
1141
- ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1138
+ ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1139
+ ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1140
+ ; EG-NEXT: FFBH_UINT T0.W, PV.W,
1142
1141
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1143
1142
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1144
- ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1145
- ; EG-NEXT: -24(nan), 0(0.000000e+00)
1146
1143
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1147
- ; EG-NEXT: LSHL * T1.W, T1.W , literal.y,
1144
+ ; EG-NEXT: LSHL * T1.W, PS , literal.y,
1148
1145
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1149
1146
; EG-NEXT: LSHL T0.X, PV.W, PS,
1150
1147
; EG-NEXT: LSHL * T0.W, literal.x, PS,
0 commit comments