llvm · JonPsson1 · Jan 18, 2024 · Jan 17, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -75,8 +75,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
     return TTI::TCC_Free;
-  // No cost model for operations on integers larger than 64 bit implemented yet.
-  if (BitSize > 64)
+  // No cost model yet for integers > 128 bits (> 64 bits without vector support).
+  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
     return TTI::TCC_Free;
 
   if (Imm == 0)
@@ -96,7 +96,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
     return 2 * TTI::TCC_Basic;
   }
 
-  return 4 * TTI::TCC_Basic;
+  // i128 immediates are loaded from the constant pool.
+  return 2 * TTI::TCC_Basic;
 }
 
 InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
@@ -479,21 +480,27 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
       return LIBCALL_COST;
 
     // Give discount for some combined logical operations if supported.
-    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+    if (Args.size() == 2) {
       if (Opcode == Instruction::Xor) {
         for (const Value *A : Args) {
           if (const Instruction *I = dyn_cast<Instruction>(A))
             if (I->hasOneUse() &&
-                (I->getOpcode() == Instruction::And ||
-                 I->getOpcode() == Instruction::Or ||
+                (I->getOpcode() == Instruction::Or ||
+                 I->getOpcode() == Instruction::And ||
                  I->getOpcode() == Instruction::Xor))
-              return 0;
+              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
+                  (isInt128InVR(Ty) &&
+                   (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
+                return 0;
         }
       }
-      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+      else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
         for (const Value *A : Args) {
           if (const Instruction *I = dyn_cast<Instruction>(A))
-            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
+            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
+                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
+                 (isInt128InVR(Ty) &&
+                  (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
               return 0;
         }
       }
@@ -774,29 +781,63 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     assert (!Dst->isVectorTy());
 
     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (Src->isIntegerTy(128))
+        return LIBCALL_COST;
       if (SrcScalarBits >= 32 ||
           (I != nullptr && isa<LoadInst>(I->getOperand(0))))
         return 1;
       return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
     }
 
-    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
-        Src->isIntegerTy(1)) {
-      if (ST->hasLoadStoreOnCond2())
-        return 2; // li 0; loc 1
-
-      // This should be extension of a compare i1 result, which is done with
-      // ipm and a varying sequence of instructions.
-      unsigned Cost = 0;
-      if (Opcode == Instruction::SExt)
-        Cost = (DstScalarBits < 64 ? 3 : 4);
-      if (Opcode == Instruction::ZExt)
-        Cost = 3;
-      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
-      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
-        // If operands of an fp-type was compared, this costs +1.
-        Cost++;
-      return Cost;
+    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
+        Dst->isIntegerTy(128))
+      return LIBCALL_COST;
+
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
+      if (Src->isIntegerTy(1)) {
+        if (DstScalarBits == 128)
+          return 5 /*branch seq.*/;
+
+        if (ST->hasLoadStoreOnCond2())
+          return 2; // li 0; loc 1
+
+        // This should be extension of a compare i1 result, which is done with
+        // ipm and a varying sequence of instructions.
+        unsigned Cost = 0;
+        if (Opcode == Instruction::SExt)
+          Cost = (DstScalarBits < 64 ? 3 : 4);
+        if (Opcode == Instruction::ZExt)
+          Cost = 3;
+        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+          // If operands of an fp-type were compared, this costs +1.
+          Cost++;
+        return Cost;
+      }
+      else if (isInt128InVR(Dst)) {
+        // Extensions from GPR to i128 (in VR) typically cost two instructions,
+        // but a zero-extending load would be just one extra instruction.
+        if (Opcode == Instruction::ZExt && I != nullptr)
+          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+            if (Ld->hasOneUse())
+              return 1;
+        return 2;
+      }
+    }
+
+    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
+      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+        if (Ld->hasOneUse())
+          return 0;  // Will be converted to GPR load.
+      bool OnlyTruncatingStores = true;
+      for (const User *U : I->users())
+        if (!isa<StoreInst>(U)) {
+          OnlyTruncatingStores = false;
+          break;
+        }
+      if (OnlyTruncatingStores)
+        return 0;
+      return 2; // Vector element extraction.
     }
   }
   else if (ST->hasVector()) {
@@ -930,7 +971,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       // A loaded value compared with 0 with multiple users becomes Load and
       // Test. The load is then not foldable, so return 0 cost for the ICmp.
       unsigned ScalarBits = ValTy->getScalarSizeInBits();
-      if (I != nullptr && ScalarBits >= 32)
+      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
         if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
           if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
             if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
@@ -943,8 +984,8 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       return Cost;
     }
     case Instruction::Select:
-      if (ValTy->isFloatingPointTy())
-        return 4; // No load on condition for FP - costs a conditional jump.
+      if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
+        return 4; // No LOC for FP / i128 - costs a conditional jump.
       return 1; // Load On Condition / Select Register.
     }
   }
@@ -1157,6 +1198,10 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
 
+  // FP128 is a legal type but kept in a register pair on older CPUs.
+  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
+    return 2;
+
   unsigned NumOps =
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
@@ -1177,10 +1222,6 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     }
   }
 
-  if (Src->getScalarSizeInBits() == 128)
-    // 128 bit scalars are held in a pair of two 64 bit registers.
-    NumOps *= 2;
-
   return  NumOps;
 }
 

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -28,6 +28,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
 
   unsigned const LIBCALL_COST = 30;
 
+  bool isInt128InVR(Type *Ty) { return Ty->isIntegerTy(128) && ST->hasVector(); }
+
 public:
   explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),

diff --git a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
@@ -0,0 +1,143 @@
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+
+define i128 @fun1(i128 %val1, i128 %val2) {
+; CHECK-LABEL: 'fun1'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v128 = sext i1 %cmp to i128
+  %cmp = icmp eq i128 %val1, %val2
+  %v128 = sext i1 %cmp to i128
+  ret i128 %v128
+}
+
+define i128 @fun2(i128 %val1, i128 %val2) {
+; CHECK-LABEL: 'fun2'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v128 = zext i1 %cmp to i128
+  %cmp = icmp eq i128 %val1, %val2
+  %v128 = zext i1 %cmp to i128
+  ret i128 %v128
+}
+
+define i128 @fun3(i128 %val1, i128 %val2,
+                  i128 %val3, i128 %val4) {
+; CHECK-LABEL: 'fun3'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %add = add i128 %val3, %val4
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %sel = select i1 %cmp, i128 %val3, i128 %add
+  %cmp = icmp eq i128 %val1, %val2
+  %add = add i128 %val3, %val4
+  %sel = select i1 %cmp, i128 %val3, i128 %add
+  ret i128 %sel
+}
+
+define i128 @fun4(ptr %src) {
+; CHECK-LABEL: 'fun4'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
+  %v = load i64, ptr %src, align 8
+  %res = sext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun5(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: 'fun5'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
+  %v = add i64 %lhs, %rhs
+  %res = sext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun6(ptr %src) {
+; CHECK-LABEL: 'fun6'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res = zext i64 %v to i128
+  %v = load i64, ptr %src, align 8
+  %res = zext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun7(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: 'fun7'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = zext i64 %v to i128
+  %v = add i64 %lhs, %rhs
+  %res = zext i64 %v to i128
+  ret i128 %res
+}
+
+; Truncating store is free.
+define void @fun8(i128 %lhs, i128 %rhs, ptr %dst) {
+; CHECK-LABEL: 'fun8'
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  store i64 %t, ptr %dst, align 8
+  ret void
+}
+
+; If there is a non-store user, an extraction is needed.
+define i64 @fun9(i128 %lhs, i128 %rhs, ptr %dst) {
+; CHECK-LABEL: 'fun9'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  store i64 %t, ptr %dst, align 8
+  ret i64 %t
+}
+
+; Truncation of load is free.
+define i64 @fun10(ptr %src) {
+; CHECK-LABEL: 'fun10'
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %t = trunc i128 %v to i64
+  %v = load i128, ptr %src, align 8
+  %t = trunc i128 %v to i64
+  ret i64 %t
+}
+
+; If the load has another user, the truncation becomes an extract.
+define i64 @fun11(ptr %src, i128 %val2, ptr %dst) {
+; CHECK-LABEL: 'fun11'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = load i128, ptr %src, align 8
+  %t = trunc i128 %v to i64
+  %a = add i128 %v, %val2
+  store i128 %a, ptr %dst
+  ret i64 %t
+}
+
+; Truncation with a GPR use typically requires an extraction.
+define i64 @fun12(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: 'fun12'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  ret i64 %t
+}
+
+; Fp<->Int conversions require libcalls.
+define void @fun13() {
+; CHECK-LABEL: 'fun13'
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v0 = fptosi fp128 undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v1 = fptosi double undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v2 = fptosi float undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v3 = fptoui fp128 undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v4 = fptoui double undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v5 = fptoui float undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v6 = sitofp i128 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v7 = sitofp i128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v8 = sitofp i128 undef to float
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v9 = uitofp i128 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v10 = uitofp i128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v11 = uitofp i128 undef to float
+  %v0 = fptosi fp128 undef to i128
+  %v1 = fptosi double undef to i128
+  %v2 = fptosi float undef to i128
+  %v3 = fptoui fp128 undef to i128
+  %v4 = fptoui double undef to i128
+  %v5 = fptoui float undef to i128
+  %v6 = sitofp i128 undef to fp128
+  %v7 = sitofp i128 undef to double
+  %v8 = sitofp i128 undef to float
+  %v9 = uitofp i128 undef to fp128
+  %v10 = uitofp i128 undef to double
+  %v11 = uitofp i128 undef to float
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
@@ -8,6 +8,7 @@ define void @add() {
   %res1 = add i16 undef, undef
   %res2 = add i32 undef, undef
   %res3 = add i64 undef, undef
+  %resQ = add i128 undef, undef
   %res4 = add <2 x i8> undef, undef
   %res5 = add <2 x i16> undef, undef
   %res6 = add <2 x i32> undef, undef
@@ -29,6 +30,7 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = add i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = add i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = add i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = add i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = add <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = add <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = add <2 x i32> undef, undef
@@ -54,6 +56,7 @@ define void @sub() {
   %res1 = sub i16 undef, undef
   %res2 = sub i32 undef, undef
   %res3 = sub i64 undef, undef
+  %resQ = sub i128 undef, undef
   %res4 = sub <2 x i8> undef, undef
   %res5 = sub <2 x i16> undef, undef
   %res6 = sub <2 x i32> undef, undef
@@ -75,6 +78,7 @@ define void @sub() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = sub i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = sub i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sub i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = sub i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = sub <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = sub <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = sub <2 x i32> undef, undef
@@ -100,6 +104,7 @@ define void @mul() {
   %res1 = mul i16 undef, undef
   %res2 = mul i32 undef, undef
   %res3 = mul i64 undef, undef
+  %resQ = mul i128 undef, undef
   %res4 = mul <2 x i8> undef, undef
   %res5 = mul <2 x i16> undef, undef
   %res6 = mul <2 x i32> undef, undef
@@ -121,6 +126,7 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = mul i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = mul i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = mul i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = mul i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = mul <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = mul <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = mul <2 x i32> undef, undef