@@ -143,6 +143,12 @@ class AArch64InstructionSelector : public InstructionSelector {
                                     const TargetRegisterClass *DstRC,
                                     Register Scalar,
                                     MachineIRBuilder &MIRBuilder) const;
+  /// Helper to narrow a vector that was widened by emitScalarToVector.
+  /// Copies the lowest part of a 128-bit or 64-bit vector to a 64-bit or
+  /// 32-bit vector, respectively.
+  MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
+                                 MachineIRBuilder &MIRBuilder,
+                                 MachineRegisterInfo &MRI) const;
 
   /// Emit a lane insert into \p DstReg, or a new vector register if
   /// std::nullopt is provided.
@@ -186,6 +192,8 @@ class AArch64InstructionSelector : public InstructionSelector {
   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                  MachineInstr &I);
+  bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
+                                     MachineInstr &I);
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineRegisterInfo &MRI);
   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -3897,6 +3905,31 @@ MachineInstr *AArch64InstructionSelector::emitScalarToVector(
   }
 }
 
+MachineInstr *
+AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
+                                             MachineIRBuilder &MIB,
+                                             MachineRegisterInfo &MRI) const {
+  LLT DstTy = MRI.getType(DstReg);
+  const TargetRegisterClass *RC =
+      getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
+  if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
+    LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
+    return nullptr;
+  }
+  unsigned SubReg = 0;
+  if (!getSubRegForClass(RC, TRI, SubReg))
+    return nullptr;
+  if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
+    LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
+                      << DstTy.getSizeInBits() << "\n");
+    return nullptr;
+  }
+  auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+                  .addReg(SrcReg, 0, SubReg);
+  RBI.constrainGenericRegister(DstReg, *RC, MRI);
+  return Copy;
+}
+
 bool AArch64InstructionSelector::selectMergeValues(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
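
Editor's aside, not part of the patch: the new helper boils down to one subregister COPY plus a register-class constraint. A minimal sketch of the same idea, specialized to the 128-bit-to-64-bit case and written as if it lived in this translation unit; the name `narrowTo64` is hypothetical:

```cpp
// Hypothetical, simplified variant of emitNarrowVector for the common
// 128-bit -> 64-bit case: copy the low 'dsub' subregister of the wide
// value into the 64-bit destination, then pin the destination's class.
static MachineInstr *narrowTo64(Register Dst, Register WideSrc,
                                MachineIRBuilder &MIB,
                                MachineRegisterInfo &MRI,
                                const RegisterBankInfo &RBI) {
  auto Copy = MIB.buildInstr(TargetOpcode::COPY, {Dst}, {})
                  .addReg(WideSrc, /*Flags=*/0, AArch64::dsub);
  RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
  return Copy;
}
```
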
@@ -5384,24 +5417,8 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
   if (VecSize < 128) {
     // If we had to widen to perform the insert, then we have to demote back to
     // the original size to get the result we want.
-    Register DemoteVec = InsMI->getOperand(0).getReg();
-    const TargetRegisterClass *RC =
-        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
-    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
-      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
-      return false;
-    }
-    unsigned SubReg = 0;
-    if (!getSubRegForClass(RC, TRI, SubReg))
+    if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
       return false;
-    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
-      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
-                        << "\n");
-      return false;
-    }
-    MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
-        .addReg(DemoteVec, 0, SubReg);
-    RBI.constrainGenericRegister(DstReg, *RC, MRI);
   } else {
     // No widening needed.
     InsMI->getOperand(0).setReg(DstReg);
@@ -5630,6 +5647,60 @@ bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
   return true;
 }
 
+bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
+    unsigned Opc, unsigned NumVecs, MachineInstr &I) {
+  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+  assert(Opc && "Expected an opcode?");
+  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
+  auto &MRI = *MIB.getMRI();
+  LLT Ty = MRI.getType(I.getOperand(0).getReg());
+  bool Narrow = Ty.getSizeInBits() == 64;
+
+  auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
+  SmallVector<Register, 4> Regs(NumVecs);
+  std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
+                 [](auto MO) { return MO.getReg(); });
+
+  if (Narrow) {
+    transform(Regs, Regs.begin(), [this](Register Reg) {
+      return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
+          ->getOperand(0)
+          .getReg();
+    });
+    Ty = Ty.multiplyElements(2);
+  }
+
+  Register Tuple = createQTuple(Regs, MIB);
+  auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
+  if (!LaneNo)
+    return false;
+
+  Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
+  auto Load = MIB.buildInstr(Opc, {Ty}, {})
+                  .addReg(Tuple)
+                  .addImm(LaneNo->getZExtValue())
+                  .addReg(Ptr);
+  Load.cloneMemRefs(I);
+  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
+  Register SelectedLoadDst = Load->getOperand(0).getReg();
+  unsigned SubReg = AArch64::qsub0;
+  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+    auto Vec = MIB.buildInstr(TargetOpcode::COPY,
+                              {Narrow ? DstOp(&AArch64::FPR128RegClass)
+                                      : DstOp(I.getOperand(Idx).getReg())},
+                              {})
+                   .addReg(SelectedLoadDst, 0, SubReg + Idx);
+    Register WideReg = Vec.getReg(0);
+    // Emit the subreg copies and immediately select them.
+    selectCopy(*Vec, TII, MRI, TRI, RBI);
+    if (Narrow &&
+        !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
+      return false;
+  }
+
+  return true;
+}
+
 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   // Find the intrinsic ID.
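
A reading aid, not part of the patch: the iterator arithmetic above relies on the operand layout of `G_INTRINSIC_W_SIDE_EFFECTS` for these lane loads, i.e. `NumVecs` defs, then the intrinsic ID, then `NumVecs` source vectors, the lane index, and finally the pointer. A hedged sketch of that indexing; `getLaneAndPointer` is an invented name:

```cpp
// Hypothetical helper spelling out the operand layout that
// selectVectorLoadLaneIntrinsic assumes:
//   [NumVecs defs][intrinsic ID][NumVecs src vectors][lane][pointer]
static std::pair<Register, Register>
getLaneAndPointer(const MachineInstr &I, unsigned NumVecs) {
  auto FirstSrc = I.operands_begin() + NumVecs + 1; // skip defs + intrinsic ID
  Register Lane = (FirstSrc + NumVecs)->getReg();    // constant lane index
  Register Ptr = (FirstSrc + NumVecs + 1)->getReg(); // base pointer
  return {Lane, Ptr};
}
```
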
@@ -5664,6 +5735,78 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     MIB.buildInstr(AArch64::BRK, {}, {})
         .addImm(I.getOperand(1).getImm() | ('U' << 8));
     break;
+  case Intrinsic::aarch64_neon_ld1x2: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD1Twov8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD1Twov16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD1Twov4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD1Twov8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD1Twov2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD1Twov4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD1Twov2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Twov1d;
+    else
+      llvm_unreachable("Unexpected type for ld1x2!");
+    selectVectorLoadIntrinsic(Opc, 2, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld1x3: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD1Threev8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD1Threev16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD1Threev4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD1Threev8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD1Threev2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD1Threev4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD1Threev2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Threev1d;
+    else
+      llvm_unreachable("Unexpected type for ld1x3!");
+    selectVectorLoadIntrinsic(Opc, 3, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld1x4: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD1Fourv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD1Fourv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD1Fourv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD1Fourv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD1Fourv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD1Fourv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD1Fourv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Fourv1d;
+    else
+      llvm_unreachable("Unexpected type for ld1x4!");
+    selectVectorLoadIntrinsic(Opc, 4, I);
+    break;
+  }
   case Intrinsic::aarch64_neon_ld2: {
     LLT Ty = MRI.getType(I.getOperand(0).getReg());
     unsigned Opc = 0;
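
Purely as an illustration, not part of the patch (which deliberately keeps explicit if/else ladders to match the surrounding cases): the ld1x2 type-to-opcode mapping could equivalently be expressed as data. The helper name is hypothetical; the opcode names are the real ones used above:

```cpp
// Hypothetical, table-driven spelling of the ld1x2 ladder.
static unsigned getLd1x2Opcode(LLT Ty, LLT S64, LLT P0) {
  if (Ty == S64 || Ty == P0)
    return AArch64::LD1Twov1d; // scalar 64-bit values use the .1d form
  struct Entry { unsigned Elts, EltBits, Opc; };
  static const Entry Table[] = {
      {8, 8, AArch64::LD1Twov8b},  {16, 8, AArch64::LD1Twov16b},
      {4, 16, AArch64::LD1Twov4h}, {8, 16, AArch64::LD1Twov8h},
      {2, 32, AArch64::LD1Twov2s}, {4, 32, AArch64::LD1Twov4s},
      {2, 64, AArch64::LD1Twov2d}, // covers both s64 and p0 elements
  };
  for (const Entry &E : Table)
    if (Ty.isFixedVector() && Ty.getNumElements() == E.Elts &&
        Ty.getScalarSizeInBits() == E.EltBits)
      return E.Opc;
  return 0; // unsupported type; a caller of this sketch treats 0 as "none"
}
```
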
@@ -5688,6 +5831,114 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     selectVectorLoadIntrinsic(Opc, 2, I);
     break;
   }
+  case Intrinsic::aarch64_neon_ld2lane: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc;
+    if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD2i8;
+    else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD2i16;
+    else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD2i32;
+    else if (Ty == LLT::fixed_vector(2, S64) ||
+             Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+      Opc = AArch64::LD2i64;
+    else
+      llvm_unreachable("Unexpected type for ld2lane!");
+    if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
+      return false;
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld2r: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD2Rv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD2Rv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD2Rv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD2Rv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD2Rv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD2Rv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD2Rv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD2Rv1d;
+    else
+      llvm_unreachable("Unexpected type for ld2r!");
+    selectVectorLoadIntrinsic(Opc, 2, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld3: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD3Threev8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD3Threev16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD3Threev4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD3Threev8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD3Threev2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD3Threev4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD3Threev2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Threev1d;
+    else
+      llvm_unreachable("Unexpected type for ld3!");
+    selectVectorLoadIntrinsic(Opc, 3, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld3lane: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc;
+    if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD3i8;
+    else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD3i16;
+    else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD3i32;
+    else if (Ty == LLT::fixed_vector(2, S64) ||
+             Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+      Opc = AArch64::LD3i64;
+    else
+      llvm_unreachable("Unexpected type for ld3lane!");
+    if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
+      return false;
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld3r: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD3Rv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD3Rv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD3Rv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD3Rv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD3Rv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD3Rv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD3Rv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD3Rv1d;
+    else
+      llvm_unreachable("Unexpected type for ld3r!");
+    selectVectorLoadIntrinsic(Opc, 3, I);
+    break;
+  }
   case Intrinsic::aarch64_neon_ld4: {
     LLT Ty = MRI.getType(I.getOperand(0).getReg());
     unsigned Opc = 0;
@@ -5712,6 +5963,48 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     selectVectorLoadIntrinsic(Opc, 4, I);
     break;
   }
+  case Intrinsic::aarch64_neon_ld4lane: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc;
+    if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD4i8;
+    else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD4i16;
+    else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD4i32;
+    else if (Ty == LLT::fixed_vector(2, S64) ||
+             Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+      Opc = AArch64::LD4i64;
+    else
+      llvm_unreachable("Unexpected type for ld4lane!");
+    if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
+      return false;
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld4r: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD4Rv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD4Rv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD4Rv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD4Rv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD4Rv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD4Rv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD4Rv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD4Rv1d;
+    else
+      llvm_unreachable("Unexpected type for ld4r!");
+    selectVectorLoadIntrinsic(Opc, 4, I);
+    break;
+  }
   case Intrinsic::aarch64_neon_st2: {
     Register Src1 = I.getOperand(1).getReg();
     Register Src2 = I.getOperand(2).getReg();