[VPlan] Preserve IR flags when widening casts #115373

goldsteinn · 2024-11-07T21:44:06Z

We have `nneg` for both `zext` and `uitofp`.

Fixes #114856

llvmbot · 2024-11-07T21:44:40Z

@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-amdgpu

Author: None (goldsteinn)

Changes

We have `nneg` for both `zext` and `uitofp`.

Fixes #114856

Full diff: https://github.com/llvm/llvm-project/pull/115373.diff

10 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+2)
(modified) llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll (+2)
(modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll (+1)
(modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (+1)
(modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (+1)
(modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir (+8)
(modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll (+5-1)
(added) llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll (+78)
(modified) llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll (+1-1)
(modified) llvm/test/tools/llvm-reduce/mir/preserve-machine-function-info-amdgpu.mir (+2)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6254ea15191819..ef2ca9af7268d1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1484,6 +1484,8 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
   State.set(this, Cast);
   State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
+  if (auto *CastOp = dyn_cast<Instruction>(Cast))
+    setFlags(CastOp);
 }
 
 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index 0f7a5f8e0941ad..0ae51c602a8d98 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -38,6 +38,7 @@
 ; CHECK-NEXT:     fp64-fp16-output-denormals: true
 ; CHECK-NEXT:   highBitsOf32BitAddress: 0
 ; CHECK-NEXT:   occupancy:       5
+; CHECK-NEXT:   numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT:   scavengeFI:      '%stack.0'
 ; CHECK-NEXT:   vgprForAGPRCopy: ''
 ; CHECK-NEXT:   sgprForEXECCopy: '$sgpr100_sgpr101'
@@ -304,6 +305,7 @@
 ; CHECK-NEXT:     fp64-fp16-output-denormals: true
 ; CHECK-NEXT:   highBitsOf32BitAddress: 0
 ; CHECK-NEXT:   occupancy:       5
+; CHECK-NEXT:   numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT:   scavengeFI:      '%stack.0'
 ; CHECK-NEXT:   vgprForAGPRCopy: ''
 ; CHECK-NEXT:   sgprForEXECCopy: '$sgpr100_sgpr101'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 7759501ea42268..07b933cdb6583c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -38,6 +38,7 @@
 ; AFTER-PEI-NEXT:   fp64-fp16-output-denormals: true
 ; AFTER-PEI-NEXT: highBitsOf32BitAddress: 0
 ; AFTER-PEI-NEXT: occupancy: 5
+; AFTER-PEI-NEXT: numPhysicalVGPRSpillLanes: 0
 ; AFTER-PEI-NEXT: scavengeFI: '%stack.3'
 ; AFTER-PEI-NEXT: vgprForAGPRCopy: ''
 ; AFTER-PEI-NEXT: sgprForEXECCopy: ''
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 4545c8bbeb3e6c..ea61ec9cb512ca 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -39,6 +39,7 @@
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
+; CHECK-NEXT: numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 8215ba834170f2..0a689df49258c1 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -39,6 +39,7 @@
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
+; CHECK-NEXT: numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 51795a4fea515e..b430488987e03c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -48,6 +48,7 @@
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
+# FULL-NEXT:  numPhysicalVGPRSpillLanes: 0
 # FULL-NEXT:  vgprForAGPRCopy: ''
 # FULL-NEXT:  sgprForEXECCopy: ''
 # FULL-NEXT:  longBranchReservedReg: ''
@@ -81,6 +82,7 @@
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # SIMPLE-NEXT: occupancy: 8
+# SIMPLE-NEXT: numPhysicalVGPRSpillLanes: 0
 # SIMPLE-NEXT: body:
 name: kernel0
 machineFunctionInfo:
@@ -152,6 +154,7 @@ body:             |
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
+# FULL-NEXT: numPhysicalVGPRSpillLanes: 0
 # FULL-NEXT: vgprForAGPRCopy: ''
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
@@ -174,6 +177,7 @@ body:             |
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # SIMPLE-NEXT:  occupancy: 8
+# SIMPLE-NEXT:  numPhysicalVGPRSpillLanes: 0
 # SIMPLE-NEXT: body:
 
 name: no_mfi
@@ -227,6 +231,7 @@ body:             |
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
+# FULL-NEXT: numPhysicalVGPRSpillLanes: 0
 # FULL-NEXT: vgprForAGPRCopy: ''
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
@@ -249,6 +254,7 @@ body:             |
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # SIMPLE-NEXT:  occupancy: 8
+# SIMPLE-NEXT:  numPhysicalVGPRSpillLanes: 0
 # SIMPLE-NEXT: body:
 
 name: empty_mfi
@@ -303,6 +309,7 @@ body:             |
 # FULL-NEXT: fp64-fp16-output-denormals: true
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
+# FULL-NEXT: numPhysicalVGPRSpillLanes: 0
 # FULL-NEXT: vgprForAGPRCopy: ''
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
@@ -326,6 +333,7 @@ body:             |
 # SIMPLE-NEXT: workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
 # SIMPLE-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # SIMPLE-NEXT: occupancy: 8
+# SIMPLE-NEXT: numPhysicalVGPRSpillLanes: 0
 # SIMPLE-NEXT: body:
 
 name: empty_mfi_entry_func
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index b69ede6f24f0f1..3fa4977a98e734 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -48,10 +48,11 @@
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
-; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: hasInitWholeWave: false    
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
   %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -94,6 +95,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
@@ -164,6 +166,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
@@ -216,6 +219,7 @@ define void @function() {
 ; CHECK-NEXT: fp64-fp16-output-denormals: true
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: numPhysicalVGPRSpillLanes: 0
 ; CHECK-NEXT: vgprForAGPRCopy: ''
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll
new file mode 100644
index 00000000000000..b093f35159fc71
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @uitofp_preserve_nneg(ptr nocapture noundef writeonly %result, i32 noundef %size, float noundef %y, float noundef %delta) {
+; CHECK-LABEL: @uitofp_preserve_nneg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[SIZE]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[DELTA:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = uitofp nneg <4 x i32> [[VEC_IND]] to <4 x float>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[TMP0]], <4 x float> [[BROADCAST_SPLAT3]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[RESULT:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER4]]
+; CHECK:       for.body.preheader4:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER4]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[CONV:%.*]] = uitofp nneg i32 [[TMP4]] to float
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.fmuladd.f32(float [[DELTA]], float [[CONV]], float [[Y]])
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[RESULT]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP5]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp4 = icmp sgt i32 %size, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %t.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %conv = uitofp nneg i32 %t.05 to float
+  %0 = tail call float @llvm.fmuladd.f32(float %delta, float %conv, float %y)
+  %idxprom = zext nneg i32 %t.05 to i64
+  %arrayidx = getelementptr inbounds float, ptr %result, i64 %idxprom
+  store float %0, ptr %arrayidx, align 4
+  %inc = add nuw nsw i32 %t.05, 1
+  %cmp = icmp slt i32 %inc, %size
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index e4dbc5829f6115..b312688b7932dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -109,7 +109,7 @@ define void @test3(float %0) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr null, align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fcmp olt <2 x float> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> poison, <2 x i1> <i1 true, i1 true>, i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> poison, <2 x i1> splat (i1 true), i64 0)
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> [[TMP6]], <2 x i1> [[TMP5]], i64 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
diff --git a/llvm/test/tools/llvm-reduce/mir/preserve-machine-function-info-amdgpu.mir b/llvm/test/tools/llvm-reduce/mir/preserve-machine-function-info-amdgpu.mir
index 73e75fc0f7ef5b..303ebaabd34410 100644
--- a/llvm/test/tools/llvm-reduce/mir/preserve-machine-function-info-amdgpu.mir
+++ b/llvm/test/tools/llvm-reduce/mir/preserve-machine-function-info-amdgpu.mir
@@ -52,10 +52,12 @@
 # RESULT-NEXT: fp64-fp16-output-denormals: false
 # RESULT-NEXT: highBitsOf32BitAddress: 4276993775
 # RESULT-NEXT: occupancy:       8
+# RESULT-NEXT: numPhysicalVGPRSpillLanes: 0
 # RESULT-NEXT: wwmReservedRegs:
 # RESULT-NEXT: - '$vgpr2'
 # RESULT-NEXT: - '$vgpr3'
 # RESULT-NEXT: vgprForAGPRCopy: '$vgpr33'
+# RESULT-NEXT: body:
 
 # RESULT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51
 # RESULT: S_NOP 0, implicit $vgpr33

nikic · 2024-11-07T22:04:56Z

Looks like there are some unrelated test changes.

goldsteinn · 2024-11-07T23:49:00Z

> Looks like there are some unrelated test changes.

Fixed. Sorry, I saw them failing earlier and didn't really think about it when fixing them.

dtcxzyw · 2024-11-08T06:31:54Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s


Suggested change

; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s

; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s

Yes please, the test should be independent of O3

dtcxzyw

LG.

dtcxzyw · 2024-11-08T06:34:38Z

It reminds me that we should support samesign in VPRecipeWithIRFlags.

Mel-Chen

LG, Thank you.

Mel-Chen · 2024-11-08T07:16:22Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @uitofp_preserve_nneg(ptr nocapture noundef writeonly %result, i32 noundef %size, float noundef %y, float noundef %delta) {


nit: Do you really need dso_local, nocapture, noundef, writeonly in this patch? If not, please clean up.

fhahn · 2024-11-08T10:05:25Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"


Does this test need to be X86-specific? It looks like you are already forcing the VF and interleave count, so it would be good to drop the target triple and move the test up one level if possible.

fhahn · 2024-11-08T10:05:48Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+  %cmp4 = icmp sgt i32 %size, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end


this shouldn't be needed

fhahn · 2024-11-08T10:07:11Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+  br label %for.body
+
+for.body:
+  %t.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]


nit:

Suggested change

%t.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]

%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]

fhahn · 2024-11-08T10:07:47Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+  %arrayidx = getelementptr inbounds float, ptr %result, i64 %idxprom
+  store float %0, ptr %arrayidx, align 4
+  %inc = add nuw nsw i32 %t.05, 1
+  %cmp = icmp slt i32 %inc, %size


You can use a constant trip count to avoid the runtime check and the extension of `%size`.

fhahn · 2024-11-08T10:08:04Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void


Suggested change

for.end.loopexit:

br label %for.end

for.end:

ret void

exit:

ret void

fhahn · 2024-11-08T10:08:31Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

+for.body:
+  %t.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %conv = uitofp nneg i32 %t.05 to float
+  %0 = tail call float @llvm.fmuladd.f32(float %delta, float %conv, float %y)


Would it be sufficient to just have a simple fmul/fadd instruction?

Sure, I didn't really simplify the tests much, but will do so.

fhahn · 2024-11-08T10:09:45Z

llvm/test/Transforms/LoopVectorize/X86/uitofp-preserve-nneg.ll

@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s


Yes please, the test should be independent of O3

fhahn · 2024-11-08T10:10:02Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

@@ -1484,6 +1484,8 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);


Would be nice to have a way to specify the flags at construction rather than having to add them after the fact, but it looks like the builder interface hasn't been updated yet to take the flags at construction :(

We have `nneg` for both `sext` and `uitofp`. Fixes llvm#114856

fhahn

LGTM, with another nit in the test as a suggestion.

fhahn · 2024-11-08T20:21:03Z

llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll

+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s
+
+define dso_local void @uitofp_preserve_nneg(ptr nocapture noundef writeonly %result, i32 noundef %size, float noundef %y) {


Suggested change

define dso_local void @uitofp_preserve_nneg(ptr nocapture noundef writeonly %result, i32 noundef %size, float noundef %y) {

define void @uitofp_preserve_nneg(ptr %result, float %y) {

We have `nneg` for both `sext` and `uitofp`. Fixes llvm#114856 Closes llvm#115373

goldsteinn requested review from nikic, fhahn, RKSimon, rovka, davemgreen, dtcxzyw and HanKuanChen November 7, 2024 21:44

llvmbot added backend:AMDGPU vectorizers llvm:transforms labels Nov 7, 2024

goldsteinn requested a review from arsenm November 7, 2024 21:44

goldsteinn force-pushed the goldsteinn/vplan-casts branch from 08c0745 to b21f82b Compare November 8, 2024 02:02

arsenm approved these changes Nov 8, 2024

View reviewed changes

dtcxzyw reviewed Nov 8, 2024

View reviewed changes

dtcxzyw approved these changes Nov 8, 2024

View reviewed changes

Mel-Chen approved these changes Nov 8, 2024

View reviewed changes

fhahn reviewed Nov 8, 2024

View reviewed changes

goldsteinn added 2 commits November 8, 2024 12:56

[LV] Add test for preserving flags when widening casts; NFC

2dc6b5b

[VPlan] Preserve IR flags when widening casts

8c1ad9b

We have `nneg` for both `sext` and `uitofp`. Fixes llvm#114856

goldsteinn force-pushed the goldsteinn/vplan-casts branch from b21f82b to 8c1ad9b Compare November 8, 2024 19:07

fhahn approved these changes Nov 8, 2024

View reviewed changes

goldsteinn closed this in 8af5ae0 Nov 8, 2024

Groverkss pushed a commit to iree-org/llvm-project that referenced this pull request Nov 15, 2024

[VPlan] Preserve IR flags when widening casts

cccff9b

We have `nneg` for both `sext` and `uitofp`. Fixes llvm#114856 Closes llvm#115373

		@@ -0,0 +1,78 @@
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
		; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s

	; RUN: opt -passes="default<O3>,loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s
	; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s

		%cmp4 = icmp sgt i32 %size, 0
		br i1 %cmp4, label %for.body.preheader, label %for.end

	%t.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
	%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]

		@@ -1484,6 +1484,8 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
		Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);

	define dso_local void @uitofp_preserve_nneg(ptr nocapture noundef writeonly %result, i32 noundef %size, float noundef %y) {
	define void @uitofp_preserve_nneg(ptr %result, float %y) {

[VPlan] Preserve IR flags when widening casts #115373

[VPlan] Preserve IR flags when widening casts #115373

Uh oh!

Conversation

goldsteinn commented Nov 7, 2024

Uh oh!

llvmbot commented Nov 7, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

nikic commented Nov 7, 2024

Uh oh!

goldsteinn commented Nov 7, 2024

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

dtcxzyw left a comment

Choose a reason for hiding this comment

Uh oh!

dtcxzyw commented Nov 8, 2024

Uh oh!

Mel-Chen left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

fhahn left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvmbot commented Nov 7, 2024 •

edited

Loading