Skip to content

Commit 47b89fb

Browse files
authored
[AArch64] Use i32 extract from UADDV in popcount lowering. (#140718)
We need the top bits to be zero, but a v8i8->i32 EXTRACT_VECTOR_ELT will anyext into the top bits. The result of the instruction we create (UADDV) is known to be zero in the upper bits, so we can NVCAST it to a v2i32 vector (same 64-bit width, wider elements) and extract the i32 from there, similar to the operation already performed for i64 types via v1i64. Fixes #140707
1 parent 6fb23af commit 47b89fb

File tree

2 files changed

+113
-7
lines changed

2 files changed

+113
-7
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10866,13 +10866,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1086610866

1086710867
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
1086810868
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10869-
if (VT == MVT::i32)
10870-
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
10871-
DAG.getConstant(0, DL, MVT::i64));
10872-
else
10873-
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
10874-
DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
10875-
DAG.getConstant(0, DL, MVT::i64));
10869+
AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
10870+
VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
10871+
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
10872+
DAG.getConstant(0, DL, MVT::i64));
1087610873
if (IsParity)
1087710874
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
1087810875
return AddV;

llvm/test/CodeGen/AArch64/popcount.ll

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,4 +648,113 @@ Entry:
648648
ret <4 x i16> %1
649649
}
650650

651+
define i32 @ctpop_into_extract(ptr %p) {
652+
; CHECKO0-LABEL: ctpop_into_extract:
653+
; CHECKO0: // %bb.0:
654+
; CHECKO0-NEXT: mov w8, #-1 // =0xffffffff
655+
; CHECKO0-NEXT: // implicit-def: $d1
656+
; CHECKO0-NEXT: // implicit-def: $q0
657+
; CHECKO0-NEXT: fmov d0, d1
658+
; CHECKO0-NEXT: mov v0.s[0], w8
659+
; CHECKO0-NEXT: fmov d2, d0
660+
; CHECKO0-NEXT: ldr d0, [x0]
661+
; CHECKO0-NEXT: fmov s1, s0
662+
; CHECKO0-NEXT: fmov w8, s1
663+
; CHECKO0-NEXT: fmov s1, w8
664+
; CHECKO0-NEXT: // kill: def $d1 killed $s1
665+
; CHECKO0-NEXT: cnt v1.8b, v1.8b
666+
; CHECKO0-NEXT: uaddlv h1, v1.8b
667+
; CHECKO0-NEXT: // kill: def $q1 killed $h1
668+
; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
669+
; CHECKO0-NEXT: fmov w8, s1
670+
; CHECKO0-NEXT: // implicit-def: $q1
671+
; CHECKO0-NEXT: fmov d1, d2
672+
; CHECKO0-NEXT: mov v1.s[1], w8
673+
; CHECKO0-NEXT: // kill: def $d1 killed $d1 killed $q1
674+
; CHECKO0-NEXT: sub v0.2s, v0.2s, v1.2s
675+
; CHECKO0-NEXT: str d0, [x0]
676+
; CHECKO0-NEXT: mov w0, wzr
677+
; CHECKO0-NEXT: ret
678+
;
679+
; CHECK-LABEL: ctpop_into_extract:
680+
; CHECK: // %bb.0:
681+
; CHECK-NEXT: ldr d0, [x0]
682+
; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
683+
; CHECK-NEXT: mov x8, x0
684+
; CHECK-NEXT: mov w0, wzr
685+
; CHECK-NEXT: fmov w9, s0
686+
; CHECK-NEXT: fmov s1, w9
687+
; CHECK-NEXT: cnt v1.8b, v1.8b
688+
; CHECK-NEXT: addv b1, v1.8b
689+
; CHECK-NEXT: mov v2.s[1], v1.s[0]
690+
; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
691+
; CHECK-NEXT: str d0, [x8]
692+
; CHECK-NEXT: ret
693+
;
694+
; BE-LABEL: ctpop_into_extract:
695+
; BE: // %bb.0:
696+
; BE-NEXT: ld1 { v0.2s }, [x0]
697+
; BE-NEXT: movi v2.2d, #0xffffffffffffffff
698+
; BE-NEXT: mov x8, x0
699+
; BE-NEXT: mov w0, wzr
700+
; BE-NEXT: fmov w9, s0
701+
; BE-NEXT: fmov s1, w9
702+
; BE-NEXT: cnt v1.8b, v1.8b
703+
; BE-NEXT: addv b1, v1.8b
704+
; BE-NEXT: mov v2.s[1], v1.s[0]
705+
; BE-NEXT: sub v0.2s, v0.2s, v2.2s
706+
; BE-NEXT: st1 { v0.2s }, [x8]
707+
; BE-NEXT: ret
708+
;
709+
; GISEL-LABEL: ctpop_into_extract:
710+
; GISEL: // %bb.0:
711+
; GISEL-NEXT: ldr d0, [x0]
712+
; GISEL-NEXT: mov w9, #-1 // =0xffffffff
713+
; GISEL-NEXT: mov x8, x0
714+
; GISEL-NEXT: mov v2.s[0], w9
715+
; GISEL-NEXT: mov w0, wzr
716+
; GISEL-NEXT: fmov w10, s0
717+
; GISEL-NEXT: fmov s1, w10
718+
; GISEL-NEXT: cnt v1.8b, v1.8b
719+
; GISEL-NEXT: uaddlv h1, v1.8b
720+
; GISEL-NEXT: mov v2.s[1], v1.s[0]
721+
; GISEL-NEXT: sub v0.2s, v0.2s, v2.2s
722+
; GISEL-NEXT: str d0, [x8]
723+
; GISEL-NEXT: ret
724+
;
725+
; GISELO0-LABEL: ctpop_into_extract:
726+
; GISELO0: // %bb.0:
727+
; GISELO0-NEXT: mov w8, #-1 // =0xffffffff
728+
; GISELO0-NEXT: // implicit-def: $d1
729+
; GISELO0-NEXT: // implicit-def: $q0
730+
; GISELO0-NEXT: fmov d0, d1
731+
; GISELO0-NEXT: mov v0.s[0], w8
732+
; GISELO0-NEXT: fmov d2, d0
733+
; GISELO0-NEXT: ldr d0, [x0]
734+
; GISELO0-NEXT: fmov s1, s0
735+
; GISELO0-NEXT: fmov w8, s1
736+
; GISELO0-NEXT: fmov s1, w8
737+
; GISELO0-NEXT: // kill: def $d1 killed $s1
738+
; GISELO0-NEXT: cnt v1.8b, v1.8b
739+
; GISELO0-NEXT: uaddlv h1, v1.8b
740+
; GISELO0-NEXT: // kill: def $q1 killed $h1
741+
; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
742+
; GISELO0-NEXT: fmov w8, s1
743+
; GISELO0-NEXT: // implicit-def: $q1
744+
; GISELO0-NEXT: fmov d1, d2
745+
; GISELO0-NEXT: mov v1.s[1], w8
746+
; GISELO0-NEXT: // kill: def $d1 killed $d1 killed $q1
747+
; GISELO0-NEXT: sub v0.2s, v0.2s, v1.2s
748+
; GISELO0-NEXT: str d0, [x0]
749+
; GISELO0-NEXT: mov w0, wzr
750+
; GISELO0-NEXT: ret
751+
%1 = load <2 x i32>, ptr %p, align 4
752+
%2 = extractelement <2 x i32> %1, i64 0
753+
%3 = call i32 @llvm.ctpop.i32(i32 %2)
754+
%4 = insertelement <2 x i32> <i32 -1, i32 poison>, i32 %3, i64 1
755+
%5 = sub <2 x i32> %1, %4
756+
store <2 x i32> %5, ptr %p, align 4
757+
ret i32 0
758+
}
759+
651760
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)

0 commit comments

Comments
 (0)