PreISelIntrinsicLowering: Lower llvm.exp/llvm.exp2 to a loop if scalable vec arg

steplong · steplong · commit bac7223371d8 · 2025-01-23T09:23:39.000-08:00
If the argument to an llvm.exp or llvm.exp2 intrinsic call is a scalable
vector and the target marks the operation as Expand for that type, lower
the call into a loop in PreISelIntrinsicLowering. If it is a fixed vector,
let SelectionDAG handle it.
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -10,6 +10,7 @@
 #define LLVM_ANALYSIS_TARGETLIBRARYINFO_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Module.h"
@@ -457,6 +458,21 @@ class TargetLibraryInfo {
     return Impl->CustomNames.find(F)->second;
   }
 
+  /// Translate the intrinsic \p ID into the ISD opcode it corresponds to.
+  /// Handles only Intrinsic::exp (ISD::FEXP) and Intrinsic::exp2
+  /// (ISD::FEXP2); every other ID hits llvm_unreachable.
+  static unsigned getISDNode(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::exp:
+      return ISD::FEXP;
+    case Intrinsic::exp2:
+      return ISD::FEXP2;
+    default:
+      break;
+    }
+    llvm_unreachable("Intrinsic ID not supported yet");
+  }
+
   static void initExtensionsForTriple(bool &ShouldExtI32Param,
                                       bool &ShouldExtI32Return,
                                       bool &ShouldSignExtI32Param,
diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h
@@ -0,0 +1,30 @@
+//===- llvm/Transforms/Utils/LowerVectorIntrinsics.h ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics with a scalable vector arg to loops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H
+#define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H
+
+#include <cstdint>
+#include <optional>
+
+namespace llvm {
+
+class CallInst;
+class Module;
+
+/// Replace the unary vector intrinsic call \p CI with a loop that applies the
+/// scalar intrinsic (declared in \p M) per element and erase \p CI.
+bool lowerUnaryVectorIntrinsicAsLoop(Module &M, CallInst *CI);
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
 
 using namespace llvm;
 
@@ -453,6 +454,19 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::objc_sync_exit:
       Changed |= lowerObjCCall(F, "objc_sync_exit");
       break;
+    case Intrinsic::exp:
+    case Intrinsic::exp2:
+      Changed |= forEachCall(F, [&](CallInst *CI) {
+        Type *Ty = CI->getArgOperand(0)->getType();
+        if (!isa<ScalableVectorType>(Ty))
+          return false;
+        const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+        unsigned Op = TargetLibraryInfo::getISDNode(F.getIntrinsicID());
+        if (!TL->isOperationExpand(Op, EVT::getEVT(Ty)))
+          return false;
+        return lowerUnaryVectorIntrinsicAsLoop(M, CI);
+      });
+      break;
     }
   }
   return Changed;
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_component_library(LLVMTransformUtils
   LowerInvoke.cpp
   LowerMemIntrinsics.cpp
   LowerSwitch.cpp
+  LowerVectorIntrinsics.cpp
   MatrixUtils.cpp
   MemoryOpRemark.cpp
   MemoryTaggingSupport.cpp
diff --git a/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp
@@ -0,0 +1,73 @@
+//===- LowerVectorIntrinsics.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lower-vector-intrinsics"
+
+using namespace llvm;
+
+/// Expand the unary vector intrinsic call \p CI into a scalar loop.
+///
+/// The block holding \p CI is split in front of the call and a single-block
+/// loop is placed between the halves. Iteration I extracts lane I, calls the
+/// scalar overload of the same intrinsic (declared in \p M) and inserts the
+/// result back into lane I. Uses of \p CI are redirected to the final vector
+/// and \p CI is erased. Always returns true.
+bool llvm::lowerUnaryVectorIntrinsicAsLoop(Module &M, CallInst *CI) {
+  Value *Arg = CI->getArgOperand(0);
+  VectorType *VecTy = cast<VectorType>(Arg->getType());
+
+  BasicBlock *PreLoopBB = CI->getParent();
+  Function *ParentFunc = PreLoopBB->getParent();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+  Type *Int64Ty = Type::getInt64Ty(Ctx);
+
+  // Everything from CI onwards moves to PostLoopBB; the branch that
+  // splitBasicBlock leaves in PreLoopBB is retargeted at the new loop block.
+  BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(CI);
+  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "", ParentFunc, PostLoopBB);
+  PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+  // Loop preheader: materialise the trip count. CreateElementCount yields a
+  // constant for fixed vectors and a multiple of vscale for scalable ones.
+  IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
+  Value *LoopEnd =
+      PreLoopBuilder.CreateElementCount(Int64Ty, VecTy->getElementCount());
+
+  // Loop body
+  IRBuilder<> LoopBuilder(LoopBB);
+
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(Int64Ty, 2);
+  LoopIndex->addIncoming(ConstantInt::get(Int64Ty, 0U), PreLoopBB);
+  PHINode *Vec = LoopBuilder.CreatePHI(VecTy, 2);
+  Vec->addIncoming(Arg, PreLoopBB);
+
+  Value *Elem = LoopBuilder.CreateExtractElement(Vec, LoopIndex);
+  Function *ScalarFn = Intrinsic::getOrInsertDeclaration(
+      &M, CI->getIntrinsicID(), VecTy->getElementType());
+  Value *Res = LoopBuilder.CreateCall(ScalarFn, Elem);
+  Value *NewVec = LoopBuilder.CreateInsertElement(Vec, Res, LoopIndex);
+  Vec->addIncoming(NewVec, LoopBB);
+
+  Value *One = ConstantInt::get(Int64Ty, 1U);
+  Value *NextLoopIndex = LoopBuilder.CreateAdd(LoopIndex, One);
+  LoopIndex->addIncoming(NextLoopIndex, LoopBB);
+
+  // The exit test follows the body, so the loop runs at least once.
+  Value *ExitCond =
+      LoopBuilder.CreateICmp(CmpInst::ICMP_EQ, NextLoopIndex, LoopEnd);
+  LoopBuilder.CreateCondBr(ExitCond, PostLoopBB, LoopBB);
+
+  CI->replaceAllUsesWith(NewVec);
+  CI->eraseFromParent();
+  return true;
+}
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+define <vscale x 4 x float> @scalable_vec_exp(<vscale x 4 x float> %input) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_vec_exp(
+; CHECK-SAME: <vscale x 4 x float> [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    br label %[[BB3:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP9:%.*]], %[[BB3]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <vscale x 4 x float> [ [[INPUT]], [[TMP0]] ], [ [[TMP8:%.*]], %[[BB3]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 4 x float> [[TMP5]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call float @llvm.exp.f32(float [[TMP6]])
+; CHECK-NEXT:    [[TMP8]] = insertelement <vscale x 4 x float> [[TMP5]], float [[TMP7]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB3]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP8]]
+;
+  %output = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %input)
+  ret <vscale x 4 x float> %output
+}
+
+define <4 x float> @fixed_vec_exp(<4 x float> %input) {
+; CHECK-LABEL: define <4 x float> @fixed_vec_exp(
+; CHECK-SAME: <4 x float> [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[OUTPUT:%.*]] = call <4 x float> @llvm.exp.v4f32(<4 x float> [[INPUT]])
+; CHECK-NEXT:    ret <4 x float> [[OUTPUT]]
+;
+  %output = call <4 x float> @llvm.exp.v4f32(<4 x float> %input)
+  ret <4 x float> %output
+}
+
+declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0
+declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0
+
+; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK-NEXT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -64,6 +64,7 @@ static_library("Utils") {
     "LowerInvoke.cpp",
     "LowerMemIntrinsics.cpp",
     "LowerSwitch.cpp",
+    "LowerVectorIntrinsics.cpp",
     "MatrixUtils.cpp",
     "Mem2Reg.cpp",
     "MemoryOpRemark.cpp",

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+if not "AArch64" in config.root.targets:`
	`2`	`+ config.unsupported = True`