optimize expansion, update tests and add scalar test variants

bob80905 · bob80905 · commit e3ca0f0fb4f3 · 2024-08-10T00:46:03.000-07:00
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -237,7 +237,6 @@ static bool expandNormalizeIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig->getParent());
   Builder.SetInsertPoint(Orig);
 
-  Value *Elt = Builder.CreateExtractElement(X, (uint64_t)0);
   auto *XVec = dyn_cast<FixedVectorType>(Ty);
   if (!XVec) {
     if (auto *constantFP = dyn_cast<ConstantFP>(X)) {
@@ -253,25 +252,47 @@ static bool expandNormalizeIntrinsic(CallInst *Orig) {
     return true;
   }
 
+  Value *Elt = Builder.CreateExtractElement(X, (uint64_t)0);
   unsigned XVecSize = XVec->getNumElements();
-  Value *Sum = Builder.CreateFMul(Elt, Elt);
-  for (unsigned I = 1; I < XVecSize; I++) {
-    Elt = Builder.CreateExtractElement(X, I);
-    Value *Mul = Builder.CreateFMul(Elt, Elt);
-    Sum = Builder.CreateFAdd(Sum, Mul);
+  Value *DotProduct = nullptr;
+  switch (XVecSize) {
+  case 1:
+    report_fatal_error(Twine("Invalid input vector: length is zero"),
+                       /* gen_crash_diag=*/false);
+    break;
+  case 2:
+    DotProduct = Builder.CreateIntrinsic(
+        EltTy, Intrinsic::dx_dot2, ArrayRef<Value *>{X, X}, nullptr, "dx.dot2");
+    break;
+  case 3:
+    DotProduct = Builder.CreateIntrinsic(
+        EltTy, Intrinsic::dx_dot3, ArrayRef<Value *>{X, X}, nullptr, "dx.dot3");
+    break;
+  case 4:
+    DotProduct = Builder.CreateIntrinsic(
+        EltTy, Intrinsic::dx_dot4, ArrayRef<Value *>{X, X}, nullptr, "dx.dot4");
+    break;
+  default:
+    report_fatal_error(Twine("Invalid input vector: vector size is invalid."),
+                       /* gen_crash_diag=*/false);
   }
-  Value *Length = Builder.CreateIntrinsic(
-      EltTy, Intrinsic::sqrt, ArrayRef<Value *>{Sum}, nullptr, "elt.sqrt");
+
+  Value *Multiplicand = Builder.CreateIntrinsic(EltTy, Intrinsic::dx_rsqrt,
+                                                ArrayRef<Value *>{DotProduct},
+                                                nullptr, "dx.rsqrt");
 
   // verify that the length is non-zero
-  if (auto *constantFP = dyn_cast<ConstantFP>(Length)) {
+  // (Multiplicand is the reciprocal sqrt of the dot product, i.e. 1/length;
+  // NOTE(review): a zero reciprocal implies an infinite length, not zero)
+  if (auto *constantFP = dyn_cast<ConstantFP>(Multiplicand)) {
     const APFloat &fpVal = constantFP->getValueAPF();
     if (fpVal.isZero())
       report_fatal_error(Twine("Invalid input vector: length is zero"),
                          /* gen_crash_diag=*/false);
   }
-  Value *LengthVec = Builder.CreateVectorSplat(XVecSize, Length);
-  Value *Result = Builder.CreateFDiv(X, LengthVec);
+
+  Value *MultiplicandVec = Builder.CreateVectorSplat(XVecSize, Multiplicand);
+  Value *Result = Builder.CreateFMul(X, MultiplicandVec);
 
   Orig->replaceAllUsesWith(Result);
   Orig->eraseFromParent();
diff --git a/llvm/test/CodeGen/DirectX/normalize.ll b/llvm/test/CodeGen/DirectX/normalize.ll
@@ -13,15 +13,23 @@ declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>)
 declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>)
 declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>)
 
+define noundef half @test_normalize_half(half noundef %p0) {
+entry:
+  ; CHECK: fdiv half %p0, %p0
+  %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0)
+  ret half %hlsl.normalize
+}
+
 define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) {
 entry:
   ; CHECK: extractelement <2 x half> %{{.*}}, i64 0
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <2 x half> %{{.*}}, i64 1
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; EXPCHECK: call half @llvm.sqrt.f16(half %{{.*}})
-  ; DOPCHECK: call half @dx.op.unary.f16(i32 24, half %{{.*}})
+  ; EXPCHECK: call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}})
+  ; DOPCHECK: call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}})
+  ; EXPCHECK: call half @llvm.dx.rsqrt.f16(half %{{.*}})
+  ; DOPCHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
+  ; CHECK: insertelement <2 x half> poison, half %{{.*}}, i64 0
+  ; CHECK: shufflevector <2 x half> %{{.*}}, <2 x half> poison, <2 x i32> zeroinitializer
+  ; CHECK: fmul <2 x half> %{{.*}}, %{{.*}}
 
   %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0)
   ret <2 x half> %hlsl.normalize
@@ -30,15 +38,13 @@ entry:
 define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) {
 entry:
   ; CHECK: extractelement <3 x half> %{{.*}}, i64 0
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <3 x half> %{{.*}}, i64 1
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <3 x half> %{{.*}}, i64 2
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; EXPCHECK: call half @llvm.sqrt.f16(half %{{.*}})
-  ; DOPCHECK: call half @dx.op.unary.f16(i32 24, half %{{.*}})
+  ; EXPCHECK: call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}})
+  ; DOPCHECK: call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}})
+  ; EXPCHECK: call half @llvm.dx.rsqrt.f16(half %{{.*}})
+  ; DOPCHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
+  ; CHECK: insertelement <3 x half> poison, half %{{.*}}, i64 0
+  ; CHECK: shufflevector <3 x half> %{{.*}}, <3 x half> poison, <3 x i32> zeroinitializer
+  ; CHECK: fmul <3 x half> %{{.*}}, %{{.*}}
 
   %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0)
   ret <3 x half> %hlsl.normalize
@@ -47,32 +53,35 @@ entry:
 define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) {
 entry:
   ; CHECK: extractelement <4 x half> %{{.*}}, i64 0
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 1
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 2
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x half> %{{.*}}, i64 3
-  ; CHECK: fmul half %{{.*}}, %{{.*}}
-  ; CHECK: fadd half %{{.*}}, %{{.*}}
-  ; EXPCHECK: call half @llvm.sqrt.f16(half %{{.*}})
-  ; DOPCHECK:  call half @dx.op.unary.f16(i32 24, half %{{.*}})
+  ; EXPCHECK: call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}})
+  ; DOPCHECK: call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}})
+  ; EXPCHECK: call half @llvm.dx.rsqrt.f16(half %{{.*}})
+  ; DOPCHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
+  ; CHECK: insertelement <4 x half> poison, half %{{.*}}, i64 0
+  ; CHECK: shufflevector <4 x half> %{{.*}}, <4 x half> poison, <4 x i32> zeroinitializer
+  ; CHECK: fmul <4 x half> %{{.*}}, %{{.*}}
 
   %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0)
   ret <4 x half> %hlsl.normalize
 }
 
+define noundef float @test_normalize_float(float noundef %p0) {
+entry:
+  ; CHECK: fdiv float %p0, %p0
+  %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0)
+  ret float %hlsl.normalize
+}
+
 define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) {
 entry:
   ; CHECK: extractelement <2 x float> %{{.*}}, i64 0
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <2 x float> %{{.*}}, i64 1
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: fadd float %{{.*}}, %{{.*}}
-  ; EXPCHECK: call float @llvm.sqrt.f32(float %{{.*}})
-  ; DOPCHECK: call float @dx.op.unary.f32(i32 24, float %{{.*}})
+  ; EXPCHECK: call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}})
+  ; DOPCHECK: call float @dx.op.dot2.f32(i32 54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
+  ; EXPCHECK: call float @llvm.dx.rsqrt.f32(float %{{.*}})
+  ; DOPCHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
+  ; CHECK: insertelement <2 x float> poison, float %{{.*}}, i64 0
+  ; CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <2 x i32> zeroinitializer
+  ; CHECK: fmul <2 x float> %{{.*}}, %{{.*}}
 
   %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0)
   ret <2 x float> %hlsl.normalize
@@ -81,15 +90,13 @@ entry:
 define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) {
 entry:
   ; CHECK: extractelement <3 x float> %{{.*}}, i64 0
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <3 x float> %{{.*}}, i64 1
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: fadd float %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <3 x float> %{{.*}}, i64 2
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: fadd float %{{.*}}, %{{.*}}
-  ; EXPCHECK: call float @llvm.sqrt.f32(float %{{.*}})
-  ; DOPCHECK: call float @dx.op.unary.f32(i32 24, float %{{.*}})
+  ; EXPCHECK: call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}})
+  ; DOPCHECK: call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
+  ; EXPCHECK: call float @llvm.dx.rsqrt.f32(float %{{.*}})
+  ; DOPCHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
+  ; CHECK: insertelement <3 x float> poison, float %{{.*}}, i64 0
+  ; CHECK: shufflevector <3 x float> %{{.*}}, <3 x float> poison, <3 x i32> zeroinitializer
+  ; CHECK: fmul <3 x float> %{{.*}}, %{{.*}}
 
   %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0)
   ret <3 x float> %hlsl.normalize
@@ -98,18 +105,13 @@ entry:
 define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) {
 entry:
   ; CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x float> %{{.*}}, i64 1
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: fadd float %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x float> %{{.*}}, i64 2
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: fadd float %{{.*}}, %{{.*}}
-  ; CHECK: extractelement <4 x float> %{{.*}}, i64 3
-  ; CHECK: fmul float %{{.*}}, %{{.*}}
-  ; CHECK: fadd float %{{.*}}, %{{.*}}
-  ; EXPCHECK: call float @llvm.sqrt.f32(float %{{.*}})
-  ; DOPCHECK:  call float @dx.op.unary.f32(i32 24, float %{{.*}})
+  ; EXPCHECK: call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  ; DOPCHECK: call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
+  ; EXPCHECK: call float @llvm.dx.rsqrt.f32(float %{{.*}})
+  ; DOPCHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
+  ; CHECK: insertelement <4 x float> poison, float %{{.*}}, i64 0
+  ; CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
+  ; CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
 
   %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0)
   ret <4 x float> %hlsl.normalize
diff --git a/llvm/test/CodeGen/DirectX/normalize_error.ll b/llvm/test/CodeGen/DirectX/normalize_error.ll
@@ -1,7 +1,7 @@
 ; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
 
 ; DXIL operation normalize does not support double overload type
-; CHECK: Cannot create Sqrt operation: Invalid overload type
+; CHECK: Cannot create Dot2 operation: Invalid overload type
 
 define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) {
 entry: