[ARM] Fix NaN behaviour for MVE compare intrinsics (#116371)

ostannard · web-flow · commit b2ec416aa5bc · 2024-11-19T10:23:14.000Z
The MVE intrinsics are defined as having the same behaviour as the
instructions which they correspond to. In particular, the vcmpleq and
vcmpltq intrinsics correspond to the VCMP instruction with the LE or LT
condition. However, these instructions with these two conditions do not
match the normal IEEE754 behaviour for NaNs, they return true if either
operand is a NaN, instead of false. Therefore we need to generate `fcmp`
IR instructions with the `ule` and `ult` conditions, instead of `ole`
and `olt`.

This differs from AdvSIMD, where only instructions with the EQ, GE and
GT conditions are available, and the intrinsics for the others are
defined by swapping the condition and operand order, so the results
match the IEEE754 behaviour for NaNs.
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
@@ -753,8 +753,8 @@ let params = T.Float in {
   defm: compare<"ne", fcmp_ne>;
   defm: compare<"gt", fcmp_gt>;
   defm: compare<"ge", fcmp_ge>;
-  defm: compare<"lt", fcmp_lt>;
-  defm: compare<"le", fcmp_le>;
+  defm: compare<"lt", fcmp_ult>;
+  defm: compare<"le", fcmp_ule>;
 }
 
 let params = T.Signed in {
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
@@ -116,8 +116,8 @@ def fcmp_eq: IRBuilder<"CreateFCmpOEQ">;
 def fcmp_ne: IRBuilder<"CreateFCmpUNE">; // not O: it must return true on NaNs
 def fcmp_gt: IRBuilder<"CreateFCmpOGT">;
 def fcmp_ge: IRBuilder<"CreateFCmpOGE">;
-def fcmp_lt: IRBuilder<"CreateFCmpOLT">;
-def fcmp_le: IRBuilder<"CreateFCmpOLE">;
+def fcmp_ult: IRBuilder<"CreateFCmpULT">;
+def fcmp_ule: IRBuilder<"CreateFCmpULE">;
 def splat: CGHelperFn<"ARMMVEVectorSplat">;
 def select: IRBuilder<"CreateSelect">;
 def fneg: IRBuilder<"CreateFNeg">;
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/compare.c b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
@@ -2376,7 +2376,7 @@ mve_pred16_t test_vcmphiq_m_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
 
 // CHECK-LABEL: @test_vcmpleq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2392,7 +2392,7 @@ mve_pred16_t test_vcmpleq_f16(float16x8_t a, float16x8_t b)
 
 // CHECK-LABEL: @test_vcmpleq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2458,7 +2458,7 @@ mve_pred16_t test_vcmpleq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2476,7 +2476,7 @@ mve_pred16_t test_vcmpleq_n_f16(float16x8_t a, float16_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2548,7 +2548,7 @@ mve_pred16_t test_vcmpleq_n_s32(int32x4_t a, int32_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2567,7 +2567,7 @@ mve_pred16_t test_vcmpleq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2645,7 +2645,7 @@ mve_pred16_t test_vcmpleq_m_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2666,7 +2666,7 @@ mve_pred16_t test_vcmpleq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2746,7 +2746,7 @@ mve_pred16_t test_vcmpleq_m_n_s32(int32x4_t a, int32_t b, mve_pred16_t p)
 
 // CHECK-LABEL: @test_vcmpltq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2762,7 +2762,7 @@ mve_pred16_t test_vcmpltq_f16(float16x8_t a, float16x8_t b)
 
 // CHECK-LABEL: @test_vcmpltq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2828,7 +2828,7 @@ mve_pred16_t test_vcmpltq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2846,7 +2846,7 @@ mve_pred16_t test_vcmpltq_n_f16(float16x8_t a, float16_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2918,7 +2918,7 @@ mve_pred16_t test_vcmpltq_n_s32(int32x4_t a, int32_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2937,7 +2937,7 @@ mve_pred16_t test_vcmpltq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -3015,7 +3015,7 @@ mve_pred16_t test_vcmpltq_m_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -3036,7 +3036,7 @@ mve_pred16_t test_vcmpltq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16