[SVE][CodeGen] Fix issues with EXTRACT_SUBVECTOR when using scalable FP vectors

david-arm · david-arm · commit 88bbd3073656 · 2020-08-12T08:35:46.000+01:00
This patch fixes two issues:

1. The SVE tuple get/set intrinsics were using the wrong type (i32) for the constant index passed to EXTRACT_SUBVECTOR. This is fixed by creating the index with SelectionDAG::getVectorIdxConstant. The documentation for EXTRACT_SUBVECTOR now also describes what type the constant index should have, and this is enforced with an assert when the node is created.

2. The AArch64 backend was missing the patterns needed to extract certain subvectors (nxv4f16 and nxv2f32) from legal SVE types. These are added as part of this patch. The only way I could find to test the new patterns was to use the SVE tuple get intrinsics, although I realise this looks a bit unusual.

Tests added here:
  test/CodeGen/AArch64/sve-extract-subvector.ll

Differential Revision: https://reviews.llvm.org/D85516
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -514,7 +514,8 @@ enum NodeType {
   /// IDX is first scaled by the runtime scaling factor of T. Elements IDX
   /// through (IDX + num_elements(T) - 1) must be valid VECTOR indices. If this
   /// condition cannot be determined statically but is false at runtime, then
-  /// the result vector is undefined.
+  /// the result vector is undefined. IDX must be a constant of the target's
+  /// vector index type, which for most targets is a pointer-sized integer.
   ///
   /// This operation supports extracting a fixed-width vector from a scalable
   /// vector, but not the other way around.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5560,6 +5560,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
             (VT.getVectorMinNumElements() + N2C->getZExtValue()) <=
                 N1VT.getVectorMinNumElements()) &&
            "Extract subvector overflow!");
+    assert(N2C->getAPIntValue().getBitWidth() ==
+               TLI->getVectorIdxTy(getDataLayout())
+                   .getSizeInBits()
+                   .getFixedSize() &&
+           "Constant index for EXTRACT_SUBVECTOR has an invalid size");
 
     // Trivial extraction.
     if (VT == N1VT)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14235,9 +14235,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
       uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
       EVT ResVT = N->getValueType(0);
       uint64_t NumLanes = ResVT.getVectorElementCount().Min;
+      SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL);
       SDValue Val =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
-                      DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx);
       return DAG.getMergeValues({Val, Chain}, DL);
     }
     case Intrinsic::aarch64_sve_tuple_set: {
@@ -14263,9 +14263,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
         if (I == IdxConst)
           Opnds.push_back(Vec);
         else {
-          Opnds.push_back(
-              DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
-                          DAG.getConstant(I * NumLanes, DL, MVT::i32)));
+          SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
+          Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+                                      Vec.getValueType(), Tuple, ExtIdx));
         }
       }
       SDValue Concat =
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1155,6 +1155,16 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
   def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
             (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
 
+  // Extract subvectors from FP SVE vectors
+  def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
+            (UUNPKLO_ZZ_S ZPR:$Zs)>;
+  def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
+            (UUNPKHI_ZZ_S ZPR:$Zs)>;
+  def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 0))),
+            (UUNPKLO_ZZ_D ZPR:$Zs)>;
+  def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 2))),
+            (UUNPKHI_ZZ_D ZPR:$Zs)>;
+
   // Concatenate two predicates.
   def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
             (UZP1_PPP_S $p1, $p2)>;
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-extract-subvector.ll
@@ -28,5 +28,43 @@ define <vscale x 2 x i64> @extract_nxv2i64_nxv32i8(<vscale x 32 x i8> %z0_z1) {
   ret <vscale x 2 x i64> %ext
 }
 
+define <vscale x 4 x half> @extract_lo_nxv4f16_nxv8f16(<vscale x 8 x half> %z0) {
+; CHECK-LABEL: extract_lo_nxv4f16_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ret
+  %ext = call <vscale x 4 x half> @llvm.aarch64.sve.tuple.get.nxv8f16(<vscale x 8 x half> %z0, i32 0)
+  ret <vscale x 4 x half> %ext
+}
+
+define <vscale x 4 x half> @extract_hi_nxv4f16_nxv8f16(<vscale x 8 x half> %z0) {
+; CHECK-LABEL: extract_hi_nxv4f16_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ret
+  %ext = call <vscale x 4 x half> @llvm.aarch64.sve.tuple.get.nxv8f16(<vscale x 8 x half> %z0, i32 1)
+  ret <vscale x 4 x half> %ext
+}
+
+define <vscale x 2 x float> @extract_lo_nxv2f32_nxv4f32(<vscale x 4 x float> %z0) {
+; CHECK-LABEL: extract_lo_nxv2f32_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ret
+  %ext = call <vscale x 2 x float> @llvm.aarch64.sve.tuple.get.nxv4f32(<vscale x 4 x float> %z0, i32 0)
+  ret <vscale x 2 x float> %ext
+}
+
+define <vscale x 2 x float> @extract_hi_nxv2f32_nxv4f32(<vscale x 4 x float> %z0) {
+; CHECK-LABEL: extract_hi_nxv2f32_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    ret
+  %ext = call <vscale x 2 x float> @llvm.aarch64.sve.tuple.get.nxv4f32(<vscale x 4 x float> %z0, i32 1)
+  ret <vscale x 2 x float> %ext
+}
+
 declare <vscale x 2 x i64> @llvm.aarch64.sve.tuple.get.nxv4i64(<vscale x 4 x i64>, i32)
 declare <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv32i8(<vscale x 32 x i8>, i32)
+declare <vscale x 2 x float> @llvm.aarch64.sve.tuple.get.nxv4f32(<vscale x 4 x float>, i32)
+declare <vscale x 4 x half> @llvm.aarch64.sve.tuple.get.nxv8f16(<vscale x 8 x half>, i32)