remove use of vec_reduce_add

farzonl · farzonl · commit d0423eda7425 · 2025-01-03T18:21:54.000-05:00
diff --git a/clang/lib/Headers/hlsl/hlsl_detail.h b/clang/lib/Headers/hlsl/hlsl_detail.h
@@ -48,10 +48,11 @@ length_impl(T X) {
 }
 
 template <typename T, int N>
-constexpr enable_if_t<is_same<float, T>::value || is_same<half, T>::value, T>
+enable_if_t<is_same<float, T>::value || is_same<half, T>::value, T>
 length_vec_impl(vector<T, N> X) {
   vector<T, N> XSquared = X * X;
-  T XSquaredSum = __builtin_reduce_add(XSquared);
+  T XSquaredSum = XSquared[0];
+  [unroll] for (int i = 1; i < N; ++i) XSquaredSum += XSquared[i];
   return __builtin_elementwise_sqrt(XSquaredSum);
 }
 
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -1300,13 +1300,11 @@ float4 lerp(float4, float4, float4);
 const inline half length(half X) { return __detail::length_impl(X); }
 const inline float length(float X) { return __detail::length_impl(X); }
 
-template <int N>
-const inline half length(vector<half, N> X) {
+template <int N> const inline half length(vector<half, N> X) {
   return __detail::length_vec_impl(X);
 }
 
-template <int N> 
-const inline float length(vector<float, N> X) {
+template <int N> const inline float length(vector<float, N> X) {
   return __detail::length_vec_impl(X);
 }
 
diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
@@ -1,24 +1,14 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN: -emit-llvm -O1 -o - | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
-// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
-
+// RUN: -emit-llvm -O2 -o - | FileCheck %s
 
 // CHECK-LABEL: define noundef half @_Z16test_length_halfDh(
 // CHECK-SAME: half noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call noundef half @llvm.fabs.f16(half [[P0]])
 // CHECK-NEXT:    ret half [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef half @_Z16test_length_halfDh(
-// SPVCHECK-SAME: half noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call noundef half @llvm.fabs.f16(half [[P0]])
-// SPVCHECK-NEXT:    ret half [[ELT_ABS_I]]
-//
 half test_length_half(half p0)
 {
   return length(p0);
@@ -28,16 +18,12 @@ half test_length_half(half p0)
 // CHECK-SAME: <2 x half> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x half> [[P0]], [[P0]]
-// CHECK-NEXT:    [[RDX_FADD_I:%.*]] = tail call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[MUL_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef half @llvm.sqrt.f16(half [[RDX_FADD_I]])
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x half> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x half> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef half @llvm.sqrt.f16(half [[ADD_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef half @_Z17test_length_half2Dv2_Dh(
-// SPVCHECK-SAME: <2 x half> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[HLSL_LENGTH_I:%.*]] = tail call noundef half @llvm.spv.length.v2f16(<2 x half> [[P0]])
-// SPVCHECK-NEXT:    ret half [[HLSL_LENGTH_I]]
-//
 half test_length_half2(half2 p0)
 {
   return length(p0);
@@ -47,16 +33,14 @@ half test_length_half2(half2 p0)
 // CHECK-SAME: <3 x half> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <3 x half> [[P0]], [[P0]]
-// CHECK-NEXT:    [[RDX_FADD_I:%.*]] = tail call half @llvm.vector.reduce.fadd.v3f16(half 0xH0000, <3 x half> [[MUL_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef half @llvm.sqrt.f16(half [[RDX_FADD_I]])
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <3 x half> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <3 x half> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd half [[VECEXT_I]], [[VECEXT1_I]]
+// CHECK-NEXT:    [[VECEXT1_I_1:%.*]] = extractelement <3 x half> [[MUL_I]], i64 2
+// CHECK-NEXT:    [[ADD_I_1:%.*]] = fadd half [[ADD_I]], [[VECEXT1_I_1]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef half @llvm.sqrt.f16(half [[ADD_I_1]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef half @_Z17test_length_half3Dv3_Dh(
-// SPVCHECK-SAME: <3 x half> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[HLSL_LENGTH_I:%.*]] = tail call noundef half @llvm.spv.length.v3f16(<3 x half> [[P0]])
-// SPVCHECK-NEXT:    ret half [[HLSL_LENGTH_I]]
-//
 half test_length_half3(half3 p0)
 {
   return length(p0);
@@ -66,16 +50,16 @@ half test_length_half3(half3 p0)
 // CHECK-SAME: <4 x half> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x half> [[P0]], [[P0]]
-// CHECK-NEXT:    [[RDX_FADD_I:%.*]] = tail call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[MUL_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef half @llvm.sqrt.f16(half [[RDX_FADD_I]])
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x half> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <4 x half> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd half [[VECEXT_I]], [[VECEXT1_I]]
+// CHECK-NEXT:    [[VECEXT1_I_1:%.*]] = extractelement <4 x half> [[MUL_I]], i64 2
+// CHECK-NEXT:    [[ADD_I_1:%.*]] = fadd half [[ADD_I]], [[VECEXT1_I_1]]
+// CHECK-NEXT:    [[VECEXT1_I_2:%.*]] = extractelement <4 x half> [[MUL_I]], i64 3
+// CHECK-NEXT:    [[ADD_I_2:%.*]] = fadd half [[ADD_I_1]], [[VECEXT1_I_2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef half @llvm.sqrt.f16(half [[ADD_I_2]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef half @_Z17test_length_half4Dv4_Dh(
-// SPVCHECK-SAME: <4 x half> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[HLSL_LENGTH_I:%.*]] = tail call noundef half @llvm.spv.length.v4f16(<4 x half> [[P0]])
-// SPVCHECK-NEXT:    ret half [[HLSL_LENGTH_I]]
-//
 half test_length_half4(half4 p0)
 {
   return length(p0);
@@ -88,12 +72,6 @@ half test_length_half4(half4 p0)
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call noundef float @llvm.fabs.f32(float [[P0]])
 // CHECK-NEXT:    ret float [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef float @_Z17test_length_floatf(
-// SPVCHECK-SAME: float noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call noundef float @llvm.fabs.f32(float [[P0]])
-// SPVCHECK-NEXT:    ret float [[ELT_ABS_I]]
-//
 float test_length_float(float p0)
 {
   return length(p0);
@@ -103,16 +81,12 @@ float test_length_float(float p0)
 // CHECK-SAME: <2 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[P0]], [[P0]]
-// CHECK-NEXT:    [[RDX_FADD_I:%.*]] = tail call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[MUL_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[RDX_FADD_I]])
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x float> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x float> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd float [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[ADD_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef float @_Z18test_length_float2Dv2_f(
-// SPVCHECK-SAME: <2 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[HLSL_LENGTH_I:%.*]] = tail call noundef float @llvm.spv.length.v2f32(<2 x float> [[P0]])
-// SPVCHECK-NEXT:    ret float [[HLSL_LENGTH_I]]
-//
 float test_length_float2(float2 p0)
 {
   return length(p0);
@@ -122,38 +96,56 @@ float test_length_float2(float2 p0)
 // CHECK-SAME: <3 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <3 x float> [[P0]], [[P0]]
-// CHECK-NEXT:    [[RDX_FADD_I:%.*]] = tail call float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[MUL_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[RDX_FADD_I]])
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <3 x float> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <3 x float> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd float [[VECEXT_I]], [[VECEXT1_I]]
+// CHECK-NEXT:    [[VECEXT1_I_1:%.*]] = extractelement <3 x float> [[MUL_I]], i64 2
+// CHECK-NEXT:    [[ADD_I_1:%.*]] = fadd float [[ADD_I]], [[VECEXT1_I_1]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[ADD_I_1]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef float @_Z18test_length_float3Dv3_f(
-// SPVCHECK-SAME: <3 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[HLSL_LENGTH_I:%.*]] = tail call noundef float @llvm.spv.length.v3f32(<3 x float> [[P0]])
-// SPVCHECK-NEXT:    ret float [[HLSL_LENGTH_I]]
-//
 float test_length_float3(float3 p0)
 {
   return length(p0);
 }
 
+
 // CHECK-LABEL: define noundef float @_Z18test_length_float4Dv4_f(
 // CHECK-SAME: <4 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[P0]], [[P0]]
-// CHECK-NEXT:    [[RDX_FADD_I:%.*]] = tail call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[MUL_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[RDX_FADD_I]])
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <4 x float> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd float [[VECEXT_I]], [[VECEXT1_I]]
+// CHECK-NEXT:    [[VECEXT1_I_1:%.*]] = extractelement <4 x float> [[MUL_I]], i64 2
+// CHECK-NEXT:    [[ADD_I_1:%.*]] = fadd float [[ADD_I]], [[VECEXT1_I_1]]
+// CHECK-NEXT:    [[VECEXT1_I_2:%.*]] = extractelement <4 x float> [[MUL_I]], i64 3
+// CHECK-NEXT:    [[ADD_I_2:%.*]] = fadd float [[ADD_I_1]], [[VECEXT1_I_2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[ADD_I_2]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float test_length_float4(float4 p0)
+{
+  return length(p0);
+}
+
+
+// CHECK-LABEL: define noundef float @_Z26test_length_float4_extractDv4_f(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[P0]], [[P0]]
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[MUL_I]], i64 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <4 x float> [[MUL_I]], i64 1
+// CHECK-NEXT:    [[VECEXT1_I_1:%.*]] = extractelement <4 x float> [[MUL_I]], i64 2
+// CHECK-NEXT:    [[VECEXT1_I_2:%.*]] = extractelement <4 x float> [[MUL_I]], i64 3
+// CHECK-NEXT:    [[ADD_I12:%.*]] = fadd float [[VECEXT_I]], [[VECEXT1_I]]
+// CHECK-NEXT:    [[ADD_I12_1:%.*]] = fadd float [[ADD_I12]], [[VECEXT1_I_1]]
+// CHECK-NEXT:    [[ADD_I12_2:%.*]] = fadd float [[ADD_I12_1]], [[VECEXT1_I_2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef float @llvm.sqrt.f32(float [[ADD_I12_2]])
 // CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP0]]
 // CHECK-NEXT:    ret float [[ADD]]
 //
-// SPVCHECK-LABEL: define spir_func noundef float @_Z18test_length_float4Dv4_f(
-// SPVCHECK-SAME: <4 x float> noundef [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// SPVCHECK-NEXT:  [[ENTRY:.*:]]
-// SPVCHECK-NEXT:    [[HLSL_LENGTH_I:%.*]] = tail call noundef float @llvm.spv.length.v4f32(<4 x float> [[P0]])
-// SPVCHECK-NEXT:    [[ADD:%.*]] = fadd float [[HLSL_LENGTH_I]], [[HLSL_LENGTH_I]]
-// SPVCHECK-NEXT:    ret float [[ADD]]
-//
-float test_length_float4(float4 p0)
+float test_length_float4_extract(float4 p0)
 {
   return length(p0) + length(p0);
 }
diff --git a/llvm/test/CodeGen/DirectX/length.ll b/llvm/test/CodeGen/DirectX/length.ll

Original file line number	Diff line number	Diff line change
`@@ -48,10 +48,11 @@ length_impl(T X) {`
`48`	`48`	`}`
`49`	`49`
`50`	`50`	`template <typename T, int N>`
`51`		`-constexpr enable_if_t<is_same<float, T>::value \|\| is_same<half, T>::value, T>`
	`51`	`+enable_if_t<is_same<float, T>::value \|\| is_same<half, T>::value, T>`
`52`	`52`	`length_vec_impl(vector<T, N> X) {`
`53`	`53`	`vector<T, N> XSquared = X * X;`
`54`		`- T XSquaredSum = __builtin_reduce_add(XSquared);`
	`54`	`+ T XSquaredSum = XSquared[0];`
	`55`	`+ [unroll] for (int i = 1; i < N; ++i) XSquaredSum += XSquared[i];`
`55`	`56`	`return __builtin_elementwise_sqrt(XSquaredSum);`
`56`	`57`	`}`
`57`	`58`
Original file line number	Diff line number	Diff line change
`@@ -1300,13 +1300,11 @@ float4 lerp(float4, float4, float4);`
`1300`	`1300`	`const inline half length(half X) { return __detail::length_impl(X); }`
`1301`	`1301`	`const inline float length(float X) { return __detail::length_impl(X); }`
`1302`	`1302`
`1303`		`-template <int N>`
`1304`		`-const inline half length(vector<half, N> X) {`
	`1303`	`+template <int N> const inline half length(vector<half, N> X) {`
`1305`	`1304`	`return __detail::length_vec_impl(X);`
`1306`	`1305`	`}`
`1307`	`1306`
`1308`		`-template <int N>`
`1309`		`-const inline float length(vector<float, N> X) {`
	`1307`	`+template <int N> const inline float length(vector<float, N> X) {`
`1310`	`1308`	`return __detail::length_vec_impl(X);`
`1311`	`1309`	`}`
`1312`	`1310`