Skip to content

Commit f5b900d

Browse files
Extend intrinsic types
1 parent 8a8f4f4 commit f5b900d

File tree

3 files changed

+335
-16
lines changed

3 files changed

+335
-16
lines changed

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -110,24 +110,30 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo *UI) {
110110
Intrinsic::amdgcn_readlane, Intrinsic::amdgcn_ballot};
111111

112112
bool IsChanged = false;
113-
113+
// TODO: Vector types can also be optimized, given a generic way to query
114+
// getDeclarationIfExists().
115+
SmallVector<Type *, 7> Tys = {
116+
Type::getInt16Ty(Ctx), // i16
117+
Type::getInt32Ty(Ctx), // i32
118+
Type::getInt64Ty(Ctx), // i64
119+
Type::getHalfTy(Ctx), // Float16
120+
Type::getFloatTy(Ctx), // float
121+
Type::getDoubleTy(Ctx), // double
122+
Type::getBFloatTy(Ctx) // bfloat16
123+
};
114124
// Iterate over each intrinsic in the list and process its uses within F.
115125
for (Intrinsic::ID IID : Intrinsics) {
116-
// Determine the correct return type for the intrinsic.
117-
// Most intrinsics return i32, but amdgcn_ballot returns i64.
118-
llvm::Type *IntrinsicTy = (IID == Intrinsic::amdgcn_ballot)
119-
? llvm::Type::getInt64Ty(Ctx)
120-
: llvm::Type::getInt32Ty(Ctx);
121-
122-
// Check if the intrinsic is declared in the module with the expected type.
123-
if (Function *Intr =
124-
Intrinsic::getDeclarationIfExists(M, IID, {IntrinsicTy})) {
125-
// Iterate over all users of the intrinsic.
126-
for (User *U : Intr->users()) {
127-
// Ensure the user is an intrinsic call within function F.
128-
if (auto *II = dyn_cast<IntrinsicInst>(U)) {
129-
if (II->getFunction() == &F) {
130-
IsChanged |= optimizeUniformIntrinsic(*II, UI);
126+
for (Type *Ty : Tys) {
127+
// Check if the intrinsic is declared in the module with the expected
128+
// type.
129+
if (Function *Intr = Intrinsic::getDeclarationIfExists(M, IID, {Ty})) {
130+
// Iterate over all users of the intrinsic.
131+
for (User *U : Intr->users()) {
132+
// Ensure the user is an intrinsic call within function F.
133+
if (auto *II = dyn_cast<IntrinsicInst>(U)) {
134+
if (II->getFunction() == &F) {
135+
IsChanged |= optimizeUniformIntrinsic(*II, UI);
136+
}
131137
}
132138
}
133139
}

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,107 @@ exit:
349349
ret void
350350
}
351351

352+
define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace(1) %out) {
353+
; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(
354+
; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] {
355+
; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
356+
; CURRENT-CHECK-NEXT: [[BALLOT_PEEL:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
357+
; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[BALLOT_PEEL]], 0
358+
; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]]
359+
; CURRENT-CHECK: [[IF_PEEL]]:
360+
; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
361+
; CURRENT-CHECK-NEXT: br label %[[EXIT]]
362+
; CURRENT-CHECK: [[EXIT]]:
363+
; CURRENT-CHECK-NEXT: ret void
364+
;
365+
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(
366+
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
367+
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
368+
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
369+
; PASS-CHECK: [[WHILE]]:
370+
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
371+
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
372+
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
373+
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
374+
; PASS-CHECK: [[IF]]:
375+
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
376+
; PASS-CHECK-NEXT: br label %[[WHILE]]
377+
; PASS-CHECK: [[EXIT]]:
378+
; PASS-CHECK-NEXT: ret void
379+
;
380+
; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(
381+
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
382+
; DCE-CHECK-NEXT: [[ENTRY:.*:]]
383+
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
384+
; DCE-CHECK-NEXT: ret void
385+
;
386+
entry:
387+
br label %while
388+
389+
while:
390+
%done = phi i1 [ 0, %entry ], [ 1, %if ]
391+
%not_done = xor i1 %done, true
392+
%ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %not_done)
393+
%is_done = icmp eq i32 %ballot, 0 ; in this case is_done = !not_done
394+
br i1 %is_done, label %exit, label %if
395+
396+
if:
397+
store i32 5, ptr addrspace(1) %out
398+
br label %while
399+
400+
exit:
401+
ret void
402+
}
403+
404+
define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace(1) %out) {
405+
; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(
406+
; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
407+
; CURRENT-CHECK-NEXT: [[ENTRY:.*:]]
408+
; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
409+
; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]]
410+
; CURRENT-CHECK: [[WHILE]]:
411+
; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true)
412+
; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[BALLOT]], 0
413+
; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
414+
; CURRENT-CHECK: [[EXIT]]:
415+
; CURRENT-CHECK-NEXT: ret void
416+
;
417+
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(
418+
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
419+
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
420+
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
421+
; PASS-CHECK: [[WHILE]]:
422+
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
423+
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
424+
; PASS-CHECK: [[IF]]:
425+
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
426+
; PASS-CHECK-NEXT: br label %[[WHILE]]
427+
; PASS-CHECK: [[EXIT]]:
428+
; PASS-CHECK-NEXT: ret void
429+
;
430+
; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(
431+
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
432+
; DCE-CHECK-NEXT: [[ENTRY:.*:]]
433+
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
434+
; DCE-CHECK-NEXT: ret void
435+
;
436+
entry:
437+
br label %while
438+
439+
while:
440+
%done = phi i1 [ 0, %entry ], [ 1, %if ]
441+
%ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %done)
442+
%is_done = icmp ne i32 0, %ballot ; in this case is_done = done
443+
br i1 %is_done, label %exit, label %if
444+
445+
if:
446+
store i32 5, ptr addrspace(1) %out
447+
br label %while
448+
449+
exit:
450+
ret void
451+
}
452+
352453
declare i64 @llvm.amdgcn.ballot.i64(i1) #1
353454
!6 = !{i64 690}
354455
!7 = distinct !{!7, !8}
@@ -357,4 +458,5 @@ declare i64 @llvm.amdgcn.ballot.i64(i1) #1
357458
; CURRENT-CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
358459
; CURRENT-CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1}
359460
; CURRENT-CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
461+
; CURRENT-CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
360462
;.

llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,3 +586,214 @@ define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) {
586586
ret void
587587
}
588588

589+
define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
590+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
591+
; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
592+
; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
593+
; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
594+
; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
595+
; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1
596+
; CURRENT-CHECK-NEXT: ret void
597+
;
598+
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
599+
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
600+
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
601+
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
602+
; PASS-CHECK-NEXT: ret void
603+
;
604+
; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
605+
; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
606+
; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
607+
; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
608+
; DCE-CHECK-NEXT: ret void
609+
;
610+
%c = trunc i32 %v to i1
611+
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
612+
%ballot_ne_zero = icmp ne i32 %ballot, 0
613+
store i1 %ballot_ne_zero, ptr addrspace(1) %out
614+
ret void
615+
}
616+
617+
define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
618+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
619+
; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] {
620+
; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
621+
; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
622+
; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[TMP1]], 0
623+
; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1
624+
; CURRENT-CHECK-NEXT: ret void
625+
;
626+
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
627+
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
628+
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
629+
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
630+
; PASS-CHECK-NEXT: ret void
631+
;
632+
; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
633+
; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
634+
; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
635+
; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
636+
; DCE-CHECK-NEXT: ret void
637+
;
638+
%c = trunc i32 %v to i1
639+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
640+
%ballot_ne_zero = icmp ne i64 %ballot, 0
641+
store i1 %ballot_ne_zero, ptr addrspace(1) %out
642+
ret void
643+
}
644+
645+
define amdgpu_kernel void @test_readlane_i16(i16 %src0, i32 %src1) {
646+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
647+
; CURRENT-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
648+
; CURRENT-CHECK-NEXT: [[READLANE:%.*]] = tail call i16 @llvm.amdgcn.readlane.i16(i16 [[SRC0]], i32 [[SRC1]])
649+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
650+
; CURRENT-CHECK-NEXT: ret void
651+
;
652+
; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
653+
; PASS-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
654+
; PASS-CHECK-NEXT: call void asm sideeffect "
655+
; PASS-CHECK-NEXT: ret void
656+
;
657+
; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16(
658+
; DCE-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
659+
; DCE-CHECK-NEXT: call void asm sideeffect "
660+
; DCE-CHECK-NEXT: ret void
661+
;
662+
%readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src0, i32 %src1)
663+
call void asm sideeffect "; use $0", "s"(i16 %readlane)
664+
ret void
665+
}
666+
667+
define amdgpu_kernel void @test_readlane_i64(i64 %src0, i32 %src1) {
668+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
669+
; CURRENT-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
670+
; CURRENT-CHECK-NEXT: [[READLANE:%.*]] = tail call i64 @llvm.amdgcn.readlane.i64(i64 [[SRC0]], i32 [[SRC1]])
671+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
672+
; CURRENT-CHECK-NEXT: ret void
673+
;
674+
; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
675+
; PASS-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
676+
; PASS-CHECK-NEXT: call void asm sideeffect "
677+
; PASS-CHECK-NEXT: ret void
678+
;
679+
; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64(
680+
; DCE-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
681+
; DCE-CHECK-NEXT: call void asm sideeffect "
682+
; DCE-CHECK-NEXT: ret void
683+
;
684+
%readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
685+
call void asm sideeffect "; use $0", "s"(i64 %readlane)
686+
ret void
687+
}
688+
689+
define amdgpu_kernel void @test_readlane_bf16(bfloat %src0, i32 %src1) {
690+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
691+
; CURRENT-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
692+
; CURRENT-CHECK-NEXT: [[READLANE:%.*]] = tail call bfloat @llvm.amdgcn.readlane.bf16(bfloat [[SRC0]], i32 [[SRC1]])
693+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
694+
; CURRENT-CHECK-NEXT: ret void
695+
;
696+
; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
697+
; PASS-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
698+
; PASS-CHECK-NEXT: call void asm sideeffect "
699+
; PASS-CHECK-NEXT: ret void
700+
;
701+
; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16(
702+
; DCE-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
703+
; DCE-CHECK-NEXT: call void asm sideeffect "
704+
; DCE-CHECK-NEXT: ret void
705+
;
706+
%readlane = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src0, i32 %src1)
707+
call void asm sideeffect "; use $0", "s"(bfloat %readlane)
708+
ret void
709+
}
710+
711+
define amdgpu_kernel void @test_readlane_f16(half %src0, i32 %src1) {
712+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
713+
; CURRENT-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
714+
; CURRENT-CHECK-NEXT: [[READLANE:%.*]] = tail call half @llvm.amdgcn.readlane.f16(half [[SRC0]], i32 [[SRC1]])
715+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
716+
; CURRENT-CHECK-NEXT: ret void
717+
;
718+
; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
719+
; PASS-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
720+
; PASS-CHECK-NEXT: call void asm sideeffect "
721+
; PASS-CHECK-NEXT: ret void
722+
;
723+
; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16(
724+
; DCE-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
725+
; DCE-CHECK-NEXT: call void asm sideeffect "
726+
; DCE-CHECK-NEXT: ret void
727+
;
728+
%readlane = call half @llvm.amdgcn.readlane.f16(half %src0, i32 %src1)
729+
call void asm sideeffect "; use $0", "s"(half %readlane)
730+
ret void
731+
}
732+
733+
define amdgpu_kernel void @test_readlane_f32(float %src0, i32 %src1) {
734+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
735+
; CURRENT-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
736+
; CURRENT-CHECK-NEXT: [[READLANE:%.*]] = tail call float @llvm.amdgcn.readlane.f32(float [[SRC0]], i32 [[SRC1]])
737+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
738+
; CURRENT-CHECK-NEXT: ret void
739+
;
740+
; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
741+
; PASS-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
742+
; PASS-CHECK-NEXT: call void asm sideeffect "
743+
; PASS-CHECK-NEXT: ret void
744+
;
745+
; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32(
746+
; DCE-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
747+
; DCE-CHECK-NEXT: call void asm sideeffect "
748+
; DCE-CHECK-NEXT: ret void
749+
;
750+
%readlane = call float @llvm.amdgcn.readlane.f32(float %src0, i32 %src1)
751+
call void asm sideeffect "; use $0", "s"(float %readlane)
752+
ret void
753+
}
754+
755+
define amdgpu_kernel void @test_readlane_f64(double %src0, i32 %src1) {
756+
; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
757+
; CURRENT-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
758+
; CURRENT-CHECK-NEXT: [[READLANE:%.*]] = tail call double @llvm.amdgcn.readlane.f64(double [[SRC0]], i32 [[SRC1]])
759+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
760+
; CURRENT-CHECK-NEXT: ret void
761+
;
762+
; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
763+
; PASS-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
764+
; PASS-CHECK-NEXT: call void asm sideeffect "
765+
; PASS-CHECK-NEXT: ret void
766+
;
767+
; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64(
768+
; DCE-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
769+
; DCE-CHECK-NEXT: call void asm sideeffect "
770+
; DCE-CHECK-NEXT: ret void
771+
;
772+
%readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
773+
call void asm sideeffect "; use $0", "s"(double %readlane)
774+
ret void
775+
}
776+
; Vector-typed cases such as the one below are not optimised yet; they could be, given a generic way to query getDeclarationIfExists().
777+
define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
778+
; CURRENT-CHECK-LABEL: define void @test_readlane_v8i16(
779+
; CURRENT-CHECK-SAME: ptr addrspace(1) readnone captures(none) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] {
780+
; CURRENT-CHECK-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
781+
; CURRENT-CHECK-NEXT: tail call void asm sideeffect "
782+
; CURRENT-CHECK-NEXT: ret void
783+
;
784+
; PASS-CHECK-LABEL: define void @test_readlane_v8i16(
785+
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
786+
; PASS-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
787+
; PASS-CHECK-NEXT: call void asm sideeffect "
788+
; PASS-CHECK-NEXT: ret void
789+
;
790+
; DCE-CHECK-LABEL: define void @test_readlane_v8i16(
791+
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
792+
; DCE-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]])
793+
; DCE-CHECK-NEXT: call void asm sideeffect "
794+
; DCE-CHECK-NEXT: ret void
795+
;
796+
%x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
797+
call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
798+
ret void
799+
}

0 commit comments

Comments
 (0)