@@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
135
135
; GFX7-NEXT: s_mov_b32 s7, 0xf000
136
136
; GFX7-NEXT: s_mov_b32 s6, -1
137
137
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
138
- ; GFX7-NEXT: s_xor_b32 s1, s1, s2
139
- ; GFX7-NEXT: s_and_b32 s0, s0, s1
140
- ; GFX7-NEXT: s_xor_b32 s0, s2, s0
138
+ ; GFX7-NEXT: s_andn2_b32 s2, s2, s0
139
+ ; GFX7-NEXT: s_and_b32 s0, s1, s0
140
+ ; GFX7-NEXT: s_or_b32 s0, s0, s2
141
141
; GFX7-NEXT: v_mov_b32_e32 v0, s0
142
142
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
143
143
; GFX7-NEXT: s_endpgm
@@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
147
147
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
148
148
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
149
149
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
150
- ; GFX8-NEXT: s_xor_b32 s1, s1, s2
151
- ; GFX8-NEXT: s_and_b32 s0, s0, s1
152
- ; GFX8-NEXT: s_xor_b32 s0, s2, s0
150
+ ; GFX8-NEXT: s_andn2_b32 s2, s2, s0
151
+ ; GFX8-NEXT: s_and_b32 s0, s1, s0
152
+ ; GFX8-NEXT: s_or_b32 s0, s0, s2
153
153
; GFX8-NEXT: v_mov_b32_e32 v0, s4
154
154
; GFX8-NEXT: v_mov_b32_e32 v1, s5
155
155
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
163
163
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
164
164
; GFX10-NEXT: v_mov_b32_e32 v0, 0
165
165
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
166
- ; GFX10-NEXT: s_xor_b32 s1, s1, s2
167
- ; GFX10-NEXT: s_and_b32 s0, s0, s1
168
- ; GFX10-NEXT: s_xor_b32 s0, s2, s0
166
+ ; GFX10-NEXT: s_andn2_b32 s2, s2, s0
167
+ ; GFX10-NEXT: s_and_b32 s0, s1, s0
168
+ ; GFX10-NEXT: s_or_b32 s0, s0, s2
169
169
; GFX10-NEXT: v_mov_b32_e32 v1, s0
170
170
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
171
171
; GFX10-NEXT: s_endpgm
@@ -317,19 +317,26 @@ entry:
317
317
define amdgpu_ps float @s_s_v_bfi_sha256_ch (i32 inreg %x , i32 inreg %y , i32 %z ) {
318
318
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
319
319
; GFX7: ; %bb.0: ; %entry
320
- ; GFX7-NEXT: v_mov_b32_e32 v1, s0
321
- ; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
320
+ ; GFX7-NEXT: s_not_b32 s1, s1
321
+ ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
322
+ ; GFX7-NEXT: s_nand_b32 s0, s1, s0
323
+ ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
322
324
; GFX7-NEXT: ; return to shader part epilog
323
325
;
324
326
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
325
327
; GFX8: ; %bb.0: ; %entry
326
- ; GFX8-NEXT: v_mov_b32_e32 v1, s0
327
- ; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
328
+ ; GFX8-NEXT: s_not_b32 s1, s1
329
+ ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
330
+ ; GFX8-NEXT: s_nand_b32 s0, s1, s0
331
+ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
328
332
; GFX8-NEXT: ; return to shader part epilog
329
333
;
330
334
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
331
335
; GFX10: ; %bb.0: ; %entry
332
- ; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
336
+ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
337
+ ; GFX10-NEXT: s_not_b32 s1, s1
338
+ ; GFX10-NEXT: s_nand_b32 s0, s1, s0
339
+ ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
333
340
; GFX10-NEXT: ; return to shader part epilog
334
341
;
335
342
; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
@@ -350,30 +357,40 @@ entry:
350
357
ret float %cast
351
358
}
352
359
353
- define amdgpu_ps float @s_v_v_bfi_sha256_ch (i32 inreg %x , i32 %y , i32 %z ) {
360
+ define amdgpu_ps float @s_v_v_bfi_sha256_ch (i32 inreg %x , i32 inreg %y , i32 %z ) {
354
361
; GFX7-LABEL: s_v_v_bfi_sha256_ch:
355
362
; GFX7: ; %bb.0: ; %entry
356
- ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
363
+ ; GFX7-NEXT: s_not_b32 s1, s1
364
+ ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
365
+ ; GFX7-NEXT: s_nand_b32 s0, s1, s0
366
+ ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
357
367
; GFX7-NEXT: ; return to shader part epilog
358
368
;
359
369
; GFX8-LABEL: s_v_v_bfi_sha256_ch:
360
370
; GFX8: ; %bb.0: ; %entry
361
- ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
371
+ ; GFX8-NEXT: s_not_b32 s1, s1
372
+ ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
373
+ ; GFX8-NEXT: s_nand_b32 s0, s1, s0
374
+ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
362
375
; GFX8-NEXT: ; return to shader part epilog
363
376
;
364
377
; GFX10-LABEL: s_v_v_bfi_sha256_ch:
365
378
; GFX10: ; %bb.0: ; %entry
366
- ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
379
+ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
380
+ ; GFX10-NEXT: s_not_b32 s1, s1
381
+ ; GFX10-NEXT: s_nand_b32 s0, s1, s0
382
+ ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
367
383
; GFX10-NEXT: ; return to shader part epilog
368
384
;
369
385
; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
370
386
; GFX8-GISEL: ; %bb.0: ; %entry
371
- ; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
387
+ ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
388
+ ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0
372
389
; GFX8-GISEL-NEXT: ; return to shader part epilog
373
390
;
374
391
; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
375
392
; GFX10-GISEL: ; %bb.0: ; %entry
376
- ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
393
+ ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0
377
394
; GFX10-GISEL-NEXT: ; return to shader part epilog
378
395
entry:
379
396
%xor0 = xor i32 %y , %z
@@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6
1008
1025
define amdgpu_ps <2 x float > @s_s_v_bitselect_i64_pat_1 (i64 inreg %a , i64 inreg %b , i64 %mask ) {
1009
1026
; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
1010
1027
; GFX7: ; %bb.0:
1011
- ; GFX7-NEXT: v_mov_b32_e32 v2, s1
1012
- ; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
1013
- ; GFX7-NEXT: v_mov_b32_e32 v2, s0
1014
- ; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
1028
+ ; GFX7-NEXT: s_not_b64 s[0:1], s[0:1]
1029
+ ; GFX7-NEXT: v_or_b32_e32 v1, s3, v1
1030
+ ; GFX7-NEXT: v_or_b32_e32 v0, s2, v0
1031
+ ; GFX7-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
1032
+ ; GFX7-NEXT: v_and_b32_e32 v1, s1, v1
1033
+ ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
1015
1034
; GFX7-NEXT: ; return to shader part epilog
1016
1035
;
1017
1036
; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
1018
1037
; GFX8: ; %bb.0:
1019
- ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1020
- ; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
1021
- ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1022
- ; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
1038
+ ; GFX8-NEXT: s_not_b64 s[0:1], s[0:1]
1039
+ ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
1040
+ ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
1041
+ ; GFX8-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
1042
+ ; GFX8-NEXT: v_and_b32_e32 v1, s1, v1
1043
+ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
1023
1044
; GFX8-NEXT: ; return to shader part epilog
1024
1045
;
1025
1046
; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
1026
1047
; GFX10: ; %bb.0:
1027
- ; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
1028
- ; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
1048
+ ; GFX10-NEXT: v_or_b32_e32 v1, s3, v1
1049
+ ; GFX10-NEXT: v_or_b32_e32 v0, s2, v0
1050
+ ; GFX10-NEXT: s_not_b64 s[0:1], s[0:1]
1051
+ ; GFX10-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
1052
+ ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
1053
+ ; GFX10-NEXT: v_and_b32_e32 v1, s1, v1
1029
1054
; GFX10-NEXT: ; return to shader part epilog
1030
1055
;
1031
1056
; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
@@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1495
1520
; GFX7-NEXT: s_mov_b32 s7, 0xf000
1496
1521
; GFX7-NEXT: s_mov_b32 s6, -1
1497
1522
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1498
- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1499
1523
; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1500
- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1524
+ ; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1525
+ ; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
1501
1526
; GFX7-NEXT: s_add_u32 s0, s0, 10
1502
1527
; GFX7-NEXT: s_addc_u32 s1, s1, 0
1503
1528
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1510
1535
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1511
1536
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1512
1537
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1513
- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1514
1538
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1515
- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1539
+ ; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1540
+ ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
1516
1541
; GFX8-NEXT: s_add_u32 s0, s0, 10
1517
1542
; GFX8-NEXT: s_addc_u32 s1, s1, 0
1518
1543
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1526
1551
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1527
1552
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1528
1553
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1529
- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1530
1554
; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1531
- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1555
+ ; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1556
+ ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
1532
1557
; GFX10-NEXT: s_add_u32 s0, s0, 10
1533
1558
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1534
1559
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1583
1608
; GFX7-NEXT: s_mov_b32 s7, 0xf000
1584
1609
; GFX7-NEXT: s_mov_b32 s6, -1
1585
1610
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1586
- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1587
1611
; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1588
- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1612
+ ; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1613
+ ; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
1589
1614
; GFX7-NEXT: s_add_u32 s0, s0, 10
1590
1615
; GFX7-NEXT: s_addc_u32 s1, s1, 0
1591
1616
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1598
1623
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1599
1624
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1600
1625
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1601
- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1602
1626
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1603
- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1627
+ ; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1628
+ ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
1604
1629
; GFX8-NEXT: s_add_u32 s0, s0, 10
1605
1630
; GFX8-NEXT: s_addc_u32 s1, s1, 0
1606
1631
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1614
1639
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1615
1640
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1616
1641
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1617
- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1618
1642
; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1619
- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1643
+ ; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1644
+ ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
1620
1645
; GFX10-NEXT: s_add_u32 s0, s0, 10
1621
1646
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1622
1647
; GFX10-NEXT: v_mov_b32_e32 v0, s0
0 commit comments