@@ -457,6 +457,188 @@ define void @add_32r_seq_cst(i32* %p, i32 %v) {
ret void
}

+ ; ----- SUB -----
+
+ define void @sub_8r(i8* %p, i8 %v) {
+ ; X64-LABEL: sub_8r:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movb (%rdi), %al
+ ; X64-NEXT: subb %sil, %al
+ ; X64-NEXT: movb %al, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_8r:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: movb (%eax), %cl
+ ; X32-NEXT: subb {{[0-9]+}}(%esp), %cl
+ ; X32-NEXT: movb %cl, (%eax)
+ ; X32-NEXT: retl
+ %1 = load atomic i8, i8* %p seq_cst, align 1
+ %2 = sub i8 %1, %v
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+ }
+
+ define void @sub_16r(i16* %p, i16 %v) {
+ ; Currently the transformation is not done on 16-bit accesses, as the backend
+ ; treats 16-bit arithmetic as expensive on X86/X86_64.
+ ; X64-LABEL: sub_16r:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movzwl (%rdi), %eax
+ ; X64-NEXT: subw %si, %ax
+ ; X64-NEXT: movw %ax, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_16r:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: movzwl (%eax), %ecx
+ ; X32-NEXT: subw {{[0-9]+}}(%esp), %cx
+ ; X32-NEXT: movw %cx, (%eax)
+ ; X32-NEXT: retl
+ %1 = load atomic i16, i16* %p acquire, align 2
+ %2 = sub i16 %1, %v
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+ }
+
+ define void @sub_32r(i32* %p, i32 %v) {
+ ; X64-LABEL: sub_32r:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl (%rdi), %eax
+ ; X64-NEXT: subl %esi, %eax
+ ; X64-NEXT: movl %eax, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_32r:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: movl (%eax), %ecx
+ ; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl %ecx, (%eax)
+ ; X32-NEXT: retl
+ %1 = load atomic i32, i32* %p acquire, align 4
+ %2 = sub i32 %1, %v
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+ }
+
+ ; The following is a corner case where the load is subtracted from itself. The
+ ; pattern matching should not fold this. We only test with a 32-bit sub, but the
+ ; same applies to other sizes and operations.
+ define void @sub_32r_self(i32* %p) {
+ ; X64-LABEL: sub_32r_self:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl (%rdi), %eax
+ ; X64-NEXT: movl $0, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_32r_self:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: movl (%eax), %ecx
+ ; X32-NEXT: movl $0, (%eax)
+ ; X32-NEXT: retl
+ %1 = load atomic i32, i32* %p acquire, align 4
+ %2 = sub i32 %1, %1
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+ }
+
+ ; The following is a corner case where the load's result is returned. The
+ ; optimizer isn't allowed to duplicate the load because it's atomic.
+ define i32 @sub_32r_ret_load(i32* %p, i32 %v) {
+ ; X64-LABEL: sub_32r_ret_load:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl (%rdi), %eax
+ ; X64-NEXT: movl %eax, %ecx
+ ; X64-NEXT: subl %esi, %ecx
+ ; X64-NEXT: movl %ecx, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_32r_ret_load:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl (%ecx), %eax
+ ; X32-NEXT: movl %eax, %edx
+ ; X32-NEXT: subl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: movl %edx, (%ecx)
+ ; X32-NEXT: retl
+ ; More code here; we just don't want it to load from P.
+ %1 = load atomic i32, i32* %p acquire, align 4
+ %2 = sub i32 %1, %v
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret i32 %1
+ }
+
+ define void @sub_64r(i64* %p, i64 %v) {
+ ; X64-LABEL: sub_64r:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movq (%rdi), %rax
+ ; X64-NEXT: subq %rsi, %rax
+ ; X64-NEXT: movq %rax, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_64r:
+ ; X32: # %bb.0:
+ ; X32-NEXT: pushl %ebx
+ ; X32-NEXT: .cfi_def_cfa_offset 8
+ ; X32-NEXT: pushl %esi
+ ; X32-NEXT: .cfi_def_cfa_offset 12
+ ; X32-NEXT: .cfi_offset %esi, -12
+ ; X32-NEXT: .cfi_offset %ebx, -8
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+ ; X32-NEXT: xorl %eax, %eax
+ ; X32-NEXT: xorl %edx, %edx
+ ; X32-NEXT: xorl %ecx, %ecx
+ ; X32-NEXT: xorl %ebx, %ebx
+ ; X32-NEXT: lock cmpxchg8b (%esi)
+ ; X32-NEXT: movl %edx, %ecx
+ ; X32-NEXT: movl %eax, %ebx
+ ; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx
+ ; X32-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl (%esi), %eax
+ ; X32-NEXT: movl 4(%esi), %edx
+ ; X32-NEXT: .p2align 4, 0x90
+ ; X32-NEXT: .LBB23_1: # %atomicrmw.start
+ ; X32-NEXT: # =>This Inner Loop Header: Depth=1
+ ; X32-NEXT: lock cmpxchg8b (%esi)
+ ; X32-NEXT: jne .LBB23_1
+ ; X32-NEXT: # %bb.2: # %atomicrmw.end
+ ; X32-NEXT: popl %esi
+ ; X32-NEXT: .cfi_def_cfa_offset 8
+ ; X32-NEXT: popl %ebx
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ; We do not check X86-32 as it cannot do 'subq'.
+ %1 = load atomic i64, i64* %p acquire, align 8
+ %2 = sub i64 %1, %v
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+ }
+
+ define void @sub_32r_seq_cst(i32* %p, i32 %v) {
+ ; X64-LABEL: sub_32r_seq_cst:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl (%rdi), %eax
+ ; X64-NEXT: subl %esi, %eax
+ ; X64-NEXT: xchgl %eax, (%rdi)
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: sub_32r_seq_cst:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: movl (%eax), %ecx
+ ; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: xchgl %ecx, (%eax)
+ ; X32-NEXT: retl
+ %1 = load atomic i32, i32* %p monotonic, align 4
+ %2 = sub i32 %1, %v
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+ }
+
; ----- AND -----

define void @and_8i(i8* %p) {
@@ -593,11 +775,11 @@ define void @and_64i(i64* %p) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB24_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB31_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: xorl %ecx, %ecx
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB24_1
+ ; X32-NEXT: jne .LBB31_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -638,10 +820,10 @@ define void @and_64r(i64* %p, i64 %v) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB25_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB32_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB25_1
+ ; X32-NEXT: jne .LBB32_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -830,10 +1012,10 @@ define void @or_64i(i64* %p) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB34_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB41_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB34_1
+ ; X32-NEXT: jne .LBB41_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -874,10 +1056,10 @@ define void @or_64r(i64* %p, i64 %v) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB35_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB42_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB35_1
+ ; X32-NEXT: jne .LBB42_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1066,10 +1248,10 @@ define void @xor_64i(i64* %p) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB44_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB51_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB44_1
+ ; X32-NEXT: jne .LBB51_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1110,10 +1292,10 @@ define void @xor_64r(i64* %p, i64 %v) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB45_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB52_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB45_1
+ ; X32-NEXT: jne .LBB52_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1266,10 +1448,10 @@ define void @inc_64(i64* %p) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB51_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB58_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB51_1
+ ; X32-NEXT: jne .LBB58_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1413,10 +1595,10 @@ define void @dec_64(i64* %p) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB56_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB63_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB56_1
+ ; X32-NEXT: jne .LBB63_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1545,10 +1727,10 @@ define void @not_64(i64* %p) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB61_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB68_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB61_1
+ ; X32-NEXT: jne .LBB68_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1672,11 +1854,11 @@ define void @neg_64(i64* %p) {
; X32-NEXT: movl (%edi), %eax
; X32-NEXT: movl 4(%edi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB66_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB73_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: movl %esi, %ecx
; X32-NEXT: lock cmpxchg8b (%edi)
- ; X32-NEXT: jne .LBB66_1
+ ; X32-NEXT: jne .LBB73_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 12
@@ -1784,10 +1966,10 @@ define void @fadd_64r(double* %loc, double %val) {
; X32-NEXT: movl (%esi), %eax
; X32-NEXT: movl 4(%esi), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB69_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB76_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esi)
- ; X32-NEXT: jne .LBB69_1
+ ; X32-NEXT: jne .LBB76_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: leal -8(%ebp), %esp
; X32-NEXT: popl %esi
@@ -1874,10 +2056,10 @@ define void @fadd_64g() {
; X32-NEXT: movl glob64+4, %edx
; X32-NEXT: movl glob64, %eax
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB71_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB78_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b glob64
- ; X32-NEXT: jne .LBB71_1
+ ; X32-NEXT: jne .LBB78_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %ebx
@@ -1961,10 +2143,10 @@ define void @fadd_64imm() {
; X32-NEXT: movl -559038737, %eax
; X32-NEXT: movl -559038733, %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB73_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB80_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b -559038737
- ; X32-NEXT: jne .LBB73_1
+ ; X32-NEXT: jne .LBB80_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %ebx
@@ -2048,10 +2230,10 @@ define void @fadd_64stack() {
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB75_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB82_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%esp)
- ; X32-NEXT: jne .LBB75_1
+ ; X32-NEXT: jne .LBB82_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %ebx
@@ -2108,10 +2290,10 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) {
; X32-NEXT: movl (%edi,%esi,8), %eax
; X32-NEXT: movl 4(%edi,%esi,8), %edx
; X32-NEXT: .p2align 4, 0x90
- ; X32-NEXT: .LBB76_1: # %atomicrmw.start
+ ; X32-NEXT: .LBB83_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: lock cmpxchg8b (%edi,%esi,8)
- ; X32-NEXT: jne .LBB76_1
+ ; X32-NEXT: jne .LBB83_1
; X32-NEXT: # %bb.2: # %atomicrmw.end
; X32-NEXT: leal -12(%ebp), %esp
; X32-NEXT: popl %esi