[AArch64][GlobalISel] Select UMULL instruction #65469

Merged 4 commits on Sep 25, 2023
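In brief: the patch teaches the AArch64 post-legalizer lowering pass to recognise a widening multiply, a G_MUL whose operands are both G_ZEXT (or both G_SEXT) from a type of half the destination's scalar width, and rewrite it to the new target-specific G_UMULL/G_SMULL opcodes, which then select to the NEON UMULL/SMULL instructions. A minimal before/after sketch in MIR (register names and types are illustrative, not taken from the patch):

# Before the combine: a widening unsigned multiply built from two zero-extends.
%a:_(<8 x s8>) = COPY $d0
%b:_(<8 x s8>) = COPY $d1
%za:_(<8 x s16>) = G_ZEXT %a(<8 x s8>)
%zb:_(<8 x s16>) = G_ZEXT %b(<8 x s8>)
%m:_(<8 x s16>) = G_MUL %za, %zb

# After the combine: a single target-specific widening multiply; the extends
# and the full-width G_MUL are folded away, and selection emits UMULL.
%m:_(<8 x s16>) = G_UMULL %a(<8 x s8>), %b(<8 x s8>)
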
9 changes: 8 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
@@ -156,6 +156,13 @@ def mul_const : GICombineRule<
(apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }])
>;

def lower_mull : GICombineRule<
(defs root:$root),
(match (wip_match_opcode G_MUL):$root,
[{ return matchExtMulToMULL(*${root}, MRI); }]),
(apply [{ applyExtMulToMULL(*${root}, MRI, B, Observer); }])
>;

def build_vector_to_dup : GICombineRule<
(defs root:$root),
(match (wip_match_opcode G_BUILD_VECTOR):$root,
@@ -232,7 +239,7 @@ def AArch64PostLegalizerLowering
icmp_lowering, build_vector_lowering,
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge]> {
unmerge_ext_to_unmerge, lower_mull]> {
}

// Post-legalization combines which are primarily optimizations.
15 changes: 15 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -215,6 +215,18 @@ def G_PREFETCH : AArch64GenericInstruction {
let hasSideEffects = 1;
}

def G_UMULL : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1, type0:$src2);
let hasSideEffects = 0;
}

def G_SMULL : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src1, type0:$src2);
let hasSideEffects = 0;
}

// Generic instruction for the BSP pseudo. It is expanded into BSP, which
// expands into BSL/BIT/BIF after register allocation.
def G_BSP : AArch64GenericInstruction {
@@ -255,6 +267,9 @@ def : GINodeEquiv<G_FCMLTZ, AArch64fcmltz>;

def : GINodeEquiv<G_BSP, AArch64bsp>;

def : GINodeEquiv<G_UMULL, AArch64umull>;
def : GINodeEquiv<G_SMULL, AArch64smull>;

def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;

def : GINodeEquiv<G_PREFETCH, AArch64Prefetch>;
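The two GINodeEquiv lines map the new generic opcodes onto the existing AArch64umull/AArch64smull SelectionDAG nodes, so the SIMD patterns already written against those nodes are imported for GlobalISel and no hand-written selection code is needed. Roughly, selection then looks like this (the UMULLv8i8_v8i16 name follows the existing AArch64 SIMD definitions as I understand them; register classes are illustrative):

# Generic MIR entering instruction selection:
%m:_(<8 x s16>) = G_UMULL %a(<8 x s8>), %b(<8 x s8>)

# Selected machine instruction, via the imported AArch64umull pattern:
%m:fpr128 = UMULLv8i8_v8i16 %a:fpr64, %b:fpr64
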
8 changes: 1 addition & 7 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -119,13 +119,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampScalar(0, s32, s64);

getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
.scalarizeIf(
[=](const LegalityQuery &Query) {
return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
},
0)
.legalFor({v2s64})
.legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, s64)
.clampMaxNumElements(0, s8, 16)
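With v2s64 now simply marked legal, a v2s64 G_MUL survives legalization unchanged; the decision to scalarise moves into the post-legalizer lowering pass, since NEON has no 64x64-bit vector multiply and the operation must still be broken up whenever no MULL can be formed. What the legalizer now leaves in place (illustrative names):

# Previously scalarised here; now kept whole so the lowering pass can try to
# form {U/S}MULL first.
%m:_(<2 x s64>) = G_MUL %a(<2 x s64>), %b(<2 x s64>)
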
69 changes: 69 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1110,6 +1110,75 @@ void applyUnmergeExtToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
Observer.changedInstr(MI);
}

// Match mul({z/s}ext, {z/s}ext) => {u/s}mull, or match a v2s64 mul, which
// will be scalarised later on. Both checks live in one function so that the
// matching order is always the same: try lowering MUL to MULL first, and only
// scalarise if that is not possible.
bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI) {
// Get the instructions that defined the source operands
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);

if (DstTy.isVector()) {
// If both source operands were zero/sign-extended, {U/S}MULL can be used
unsigned I1Opc = I1->getOpcode();
unsigned I2Opc = I2->getOpcode();
if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) ||
(I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) &&
(MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() ==
MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) &&
(MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() ==
MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) {
return true;
}
// If the result type is v2s64, scalarise the instruction
else if (DstTy == LLT::fixed_vector(2, 64)) {
return true;
}
}
return false;
}

void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, GISelChangeObserver &Observer) {
assert(MI.getOpcode() == TargetOpcode::G_MUL &&
"Expected a G_MUL instruction");

// Get the instructions that defined the source operands
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);

// If both source operands were zero/sign-extended, {U/S}MULL can be used
unsigned I1Opc = I1->getOpcode();
unsigned I2Opc = I2->getOpcode();
if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) ||
(I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) &&
(MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() ==
MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) &&
(MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() ==
MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) {

B.setInstrAndDebugLoc(MI);
B.buildInstr(I1->getOpcode() == TargetOpcode::G_ZEXT ? AArch64::G_UMULL
: AArch64::G_SMULL,
{MI.getOperand(0).getReg()},
{I1->getOperand(1).getReg(), I2->getOperand(1).getReg()});
MI.eraseFromParent();
}
// If the result type is v2s64, scalarise the instruction
else if (DstTy == LLT::fixed_vector(2, 64)) {
LegalizerHelper Helper(*MI.getMF(), Observer, B);
B.setInstrAndDebugLoc(MI);
Helper.fewerElementsVector(
MI, 0,
DstTy.changeElementCount(
DstTy.getElementCount().divideCoefficientBy(2)));
}
}

class AArch64PostLegalizerLoweringImpl : public Combiner {
protected:
// TODO: Make CombinerHelper methods const.
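When a v2s64 G_MUL did not come from a pair of extends, applyExtMulToMULL falls back to fewerElementsVector, which halves the element count; the result is the familiar scalarised sequence, sketched here with illustrative names:

# Fallback for a plain v2s64 multiply: split, multiply the halves, rebuild.
%a0:_(s64), %a1:_(s64) = G_UNMERGE_VALUES %a(<2 x s64>)
%b0:_(s64), %b1:_(s64) = G_UNMERGE_VALUES %b(<2 x s64>)
%m0:_(s64) = G_MUL %a0, %b0
%m1:_(s64) = G_MUL %a1, %b1
%m:_(<2 x s64>) = G_BUILD_VECTOR %m0(s64), %m1(s64)
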
8 changes: 2 additions & 6 deletions llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
@@ -175,12 +175,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[UV]], [[UV2]]
; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[UV1]], [[UV3]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL1]](s64)
; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<2 x s64>)
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[COPY]], [[COPY1]]
; CHECK-NEXT: $q0 = COPY [[MUL]](<2 x s64>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<2 x s64>) = COPY $q0
%1:_(<2 x s64>) = COPY $q1
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
@@ -203,11 +203,9 @@ body: |
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[UV]], [[UV2]]
; CHECK-NEXT: [[SDIV1:%[0-9]+]]:_(s64) = G_SDIV [[UV1]], [[UV3]]
; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SDIV]], [[UV4]]
; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SDIV1]], [[UV5]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL1]](s64)
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<2 x s64>) = G_SUB [[COPY]], [[BUILD_VECTOR]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SDIV]](s64), [[SDIV1]](s64)
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[BUILD_VECTOR]], [[COPY1]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<2 x s64>) = G_SUB [[COPY]], [[MUL]]
; CHECK-NEXT: $q0 = COPY [[SUB]](<2 x s64>)
%0:_(<2 x s64>) = COPY $q0
%1:_(<2 x s64>) = COPY $q1
@@ -27,7 +27,7 @@
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
#
# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index
# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
#