Skip to content

Commit 3755ea9

Browse files
authored
[AMDGPU] Fix scan of atomicFSub in AtomicOptimizer. (#66082)
[D156301](https://reviews.llvm.org/D156301) introduced atomic optimizations for FAdd/FSub. For FSub, reduction/scan needs to be performed using add operation (`not sub`) and memory location will be updated by reduced value using atomic sub later by only one lane. --------- Authored-by: Pravin Jagtap <[email protected]>
1 parent f3fdc96 commit 3755ea9

File tree

5 files changed

+746
-466
lines changed

5 files changed

+746
-466
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -742,7 +742,16 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
742742

743743
Function *F = I.getFunction();
744744
LLVMContext &C = F->getContext();
745-
Value *Identity = getIdentityValueForAtomicOp(Ty, Op);
745+
746+
// For atomic sub, perform scan with add operation and allow one lane to
747+
// subtract the reduced value later.
748+
AtomicRMWInst::BinOp ScanOp = Op;
749+
if (Op == AtomicRMWInst::Sub) {
750+
ScanOp = AtomicRMWInst::Add;
751+
} else if (Op == AtomicRMWInst::FSub) {
752+
ScanOp = AtomicRMWInst::FAdd;
753+
}
754+
Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);
746755

747756
Value *ExclScan = nullptr;
748757
Value *NewV = nullptr;
@@ -754,8 +763,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
754763
// If we have a divergent value in each lane, we need to combine the value
755764
// using DPP.
756765
if (ValDivergent) {
757-
const AtomicRMWInst::BinOp ScanOp =
758-
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
759766
if (ScanImpl == ScanOptions::DPP) {
760767
// First we need to set all inactive invocations to the identity value, so
761768
// that they can correctly contribute to the final result.
@@ -766,8 +773,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
766773
NewV = B.CreateBitCast(NewV, Ty);
767774
V = B.CreateBitCast(V, Ty);
768775
Identity = B.CreateBitCast(Identity, Ty);
769-
const AtomicRMWInst::BinOp ScanOp =
770-
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
771776
if (!NeedResult && ST->hasPermLaneX16()) {
772777
// On GFX10 the permlanex16 instruction helps us build a reduction
773778
// without too many readlanes and writelanes, which are generally bad

llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
439439
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
440440
; IR-ITERATIVE-NEXT: ret float [[TMP19]]
441441
; IR-ITERATIVE: ComputeLoop:
442-
; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
442+
; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
443443
; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
444444
; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
445445
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
@@ -451,7 +451,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
451451
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
452452
; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
453453
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
454-
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fsub.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
454+
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
455455
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
456456
; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
457457
; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
@@ -472,22 +472,22 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
472472
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
473473
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
474474
; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
475-
; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 0) #[[ATTR8]]
475+
; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
476476
; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
477477
; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
478-
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
479-
; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
480-
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
481-
; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
482-
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
483-
; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
484-
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
485-
; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
486-
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
487-
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
488-
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
489-
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
490-
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
478+
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
479+
; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
480+
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
481+
; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
482+
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
483+
; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
484+
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
485+
; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
486+
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
487+
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
488+
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
489+
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
490+
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
491491
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
492492
; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
493493
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float

llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -186,14 +186,14 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
186186
; IR-ITERATIVE: 10:
187187
; IR-ITERATIVE-NEXT: ret void
188188
; IR-ITERATIVE: ComputeLoop:
189-
; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
189+
; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
190190
; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ]
191191
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
192192
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
193193
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
194194
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
195195
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
196-
; IR-ITERATIVE-NEXT: [[TMP16]] = fsub float [[ACCUMULATOR]], [[TMP15]]
196+
; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
197197
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
198198
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1
199199
; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
@@ -213,21 +213,21 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
213213
; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
214214
; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
215215
; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32
216-
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 0)
216+
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648)
217217
; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
218218
; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float
219-
; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
220-
; IR-DPP-NEXT: [[TMP12:%.*]] = fsub float [[TMP9]], [[TMP11]]
221-
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false)
222-
; IR-DPP-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
223-
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false)
224-
; IR-DPP-NEXT: [[TMP16:%.*]] = fsub float [[TMP14]], [[TMP15]]
225-
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false)
226-
; IR-DPP-NEXT: [[TMP18:%.*]] = fsub float [[TMP16]], [[TMP17]]
227-
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false)
228-
; IR-DPP-NEXT: [[TMP20:%.*]] = fsub float [[TMP18]], [[TMP19]]
229-
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
230-
; IR-DPP-NEXT: [[TMP22:%.*]] = fsub float [[TMP20]], [[TMP21]]
219+
; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
220+
; IR-DPP-NEXT: [[TMP12:%.*]] = fadd float [[TMP9]], [[TMP11]]
221+
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false)
222+
; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP12]], [[TMP13]]
223+
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false)
224+
; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]]
225+
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false)
226+
; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]]
227+
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false)
228+
; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]]
229+
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
230+
; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
231231
; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
232232
; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
233233
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float

llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -347,14 +347,14 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
347347
; IR-ITERATIVE: 13:
348348
; IR-ITERATIVE-NEXT: ret void
349349
; IR-ITERATIVE: ComputeLoop:
350-
; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
350+
; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
351351
; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
352352
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
353353
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
354354
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
355355
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
356356
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
357-
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fsub.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
357+
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
358358
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
359359
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
360360
; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
@@ -375,21 +375,21 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
375375
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
376376
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
377377
; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
378-
; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 0) #[[ATTR8]]
378+
; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
379379
; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
380380
; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
381-
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
382-
; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
383-
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
384-
; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
385-
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
386-
; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
387-
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
388-
; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
389-
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
390-
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
391-
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
392-
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
381+
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
382+
; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
383+
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
384+
; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
385+
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
386+
; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
387+
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
388+
; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
389+
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
390+
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
391+
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
392+
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
393393
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
394394
; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
395395
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float

0 commit comments

Comments
 (0)