Skip to content

Commit 09b941d

Browse files
SC llvm team
authored and committed
Merged main:1157187496af into amd-gfx:8ab4aeea6c64
Local branch amd-gfx 8ab4aee Merged main:37e48e4a7360 into amd-gfx:97f9a4fb1810 Remote branch main 1157187 [VPlan] Propagate all GEP flags (llvm#119899)
2 parents 8ab4aee + 1157187 commit 09b941d

File tree

58 files changed

+845
-400
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed

+845
-400
lines changed

clang/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -362,7 +362,6 @@ if (APPLE AND NOT CMAKE_LINKER MATCHES ".*lld.*")
362362
message(STATUS "Host linker version: ${HOST_LINK_VERSION}")
363363
endif()
364364

365-
include(CMakeParseArguments)
366365
include(AddClang)
367366

368367
set(CMAKE_INCLUDE_CURRENT_DIR ON)

clang/include/clang/Basic/arm_sme.td

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -873,6 +873,11 @@ let SMETargetGuard = "sme-f8f32" in {
873873
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
874874
def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
875875
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
876+
// FMLALL (multiple)
877+
def SVMLA_FP8_MULTI_ZA32_VG4x2 : Inst<"svmla_za32[_mf8]_vg4x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fmlall_multi_za32_vg4x2",
878+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
879+
def SVMLA_FP8_MULTI_ZA32_VG4x4 : Inst<"svmla_za32[_mf8]_vg4x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fmlall_multi_za32_vg4x4",
880+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
876881
}
877882

878883
let SMETargetGuard = "sme-f8f16" in {
@@ -892,6 +897,11 @@ let SMETargetGuard = "sme-f8f16" in {
892897
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
893898
def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
894899
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
900+
// FMLAL (multiple)
901+
def SVMLA_FP8_MULTI_ZA16_VG2x2 : Inst<"svmla_za16[_mf8]_vg2x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fmlal_multi_za16_vg2x2",
902+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
903+
def SVMLA_FP8_MULTI_ZA16_VG2x4 : Inst<"svmla_za16[_mf8]_vg2x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fmlal_multi_za16_vg2x4",
904+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
895905
}
896906

897907
} // let SVETargetGuard = InvalidMode

clang/lib/AST/ByteCode/Compiler.cpp

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -6483,14 +6483,6 @@ bool Compiler<Emitter>::emitBuiltinBitCast(const CastExpr *E) {
64836483
QualType ToType = E->getType();
64846484
std::optional<PrimType> ToT = classify(ToType);
64856485

6486-
// Bitcasting TO nullptr_t is always fine.
6487-
if (ToType->isNullPtrType()) {
6488-
if (!this->discard(SubExpr))
6489-
return false;
6490-
6491-
return this->emitNullPtr(0, nullptr, E);
6492-
}
6493-
64946486
assert(!ToType->isReferenceType());
64956487

64966488
// Prepare storage for the result in case we discard.
@@ -6523,8 +6515,8 @@ bool Compiler<Emitter>::emitBuiltinBitCast(const CastExpr *E) {
65236515
return false;
65246516
}
65256517

6526-
if (!ToT || ToT == PT_Ptr) {
6527-
if (!this->emitBitCastPtr(E))
6518+
if (!ToT) {
6519+
if (!this->emitBitCast(E))
65286520
return false;
65296521
return DiscardResult ? this->emitPopPtr(E) : true;
65306522
}
@@ -6540,8 +6532,8 @@ bool Compiler<Emitter>::emitBuiltinBitCast(const CastExpr *E) {
65406532
ToType->isSpecificBuiltinType(BuiltinType::Char_U));
65416533
uint32_t ResultBitWidth = std::max(Ctx.getBitWidth(ToType), 8u);
65426534

6543-
if (!this->emitBitCast(*ToT, ToTypeIsUChar || ToType->isStdByteType(),
6544-
ResultBitWidth, TargetSemantics, E))
6535+
if (!this->emitBitCastPrim(*ToT, ToTypeIsUChar || ToType->isStdByteType(),
6536+
ResultBitWidth, TargetSemantics, E))
65456537
return false;
65466538

65476539
if (DiscardResult)

clang/lib/AST/ByteCode/Interp.h

Lines changed: 32 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -3030,43 +3030,51 @@ bool CheckNewTypeMismatchArray(InterpState &S, CodePtr OpPC, const Expr *E) {
30303030
bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E);
30313031

30323032
template <PrimType Name, class T = typename PrimConv<Name>::T>
3033-
inline bool BitCast(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
3034-
uint32_t ResultBitWidth, const llvm::fltSemantics *Sem) {
3033+
inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
3034+
uint32_t ResultBitWidth,
3035+
const llvm::fltSemantics *Sem) {
30353036
const Pointer &FromPtr = S.Stk.pop<Pointer>();
30363037

30373038
if (!CheckLoad(S, OpPC, FromPtr))
30383039
return false;
30393040

3040-
size_t BuffSize = ResultBitWidth / 8;
3041-
llvm::SmallVector<std::byte> Buff(BuffSize);
3042-
bool HasIndeterminateBits = false;
3041+
if constexpr (std::is_same_v<T, Pointer>) {
3042+
// The only pointer type we can validly bitcast to is nullptr_t.
3043+
S.Stk.push<Pointer>();
3044+
return true;
3045+
} else {
30433046

3044-
Bits FullBitWidth(ResultBitWidth);
3045-
Bits BitWidth = FullBitWidth;
3047+
size_t BuffSize = ResultBitWidth / 8;
3048+
llvm::SmallVector<std::byte> Buff(BuffSize);
3049+
bool HasIndeterminateBits = false;
30463050

3047-
if constexpr (std::is_same_v<T, Floating>) {
3048-
assert(Sem);
3049-
BitWidth = Bits(llvm::APFloatBase::getSizeInBits(*Sem));
3050-
}
3051+
Bits FullBitWidth(ResultBitWidth);
3052+
Bits BitWidth = FullBitWidth;
30513053

3052-
if (!DoBitCast(S, OpPC, FromPtr, Buff.data(), BitWidth, FullBitWidth,
3053-
HasIndeterminateBits))
3054-
return false;
3054+
if constexpr (std::is_same_v<T, Floating>) {
3055+
assert(Sem);
3056+
BitWidth = Bits(llvm::APFloatBase::getSizeInBits(*Sem));
3057+
}
30553058

3056-
if (!CheckBitCast(S, OpPC, HasIndeterminateBits, TargetIsUCharOrByte))
3057-
return false;
3059+
if (!DoBitCast(S, OpPC, FromPtr, Buff.data(), BitWidth, FullBitWidth,
3060+
HasIndeterminateBits))
3061+
return false;
30583062

3059-
if constexpr (std::is_same_v<T, Floating>) {
3060-
assert(Sem);
3061-
S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
3062-
} else {
3063-
assert(!Sem);
3064-
S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
3063+
if (!CheckBitCast(S, OpPC, HasIndeterminateBits, TargetIsUCharOrByte))
3064+
return false;
3065+
3066+
if constexpr (std::is_same_v<T, Floating>) {
3067+
assert(Sem);
3068+
S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
3069+
} else {
3070+
assert(!Sem);
3071+
S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
3072+
}
3073+
return true;
30653074
}
3066-
return true;
30673075
}
30683076

3069-
inline bool BitCastPtr(InterpState &S, CodePtr OpPC) {
3077+
inline bool BitCast(InterpState &S, CodePtr OpPC) {
30703078
const Pointer &FromPtr = S.Stk.pop<Pointer>();
30713079
Pointer &ToPtr = S.Stk.peek<Pointer>();
30723080

clang/lib/AST/ByteCode/Opcodes.td

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -839,13 +839,14 @@ def IsConstantContext: Opcode;
839839
def CheckAllocations : Opcode;
840840

841841
def BitCastTypeClass : TypeClass {
842-
let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, IntAP, IntAPS, Bool, Float];
842+
let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64,
843+
IntAP, IntAPS, Bool, Float, Ptr];
843844
}
844845

845-
def BitCast : Opcode {
846+
def BitCastPrim : Opcode {
846847
let Types = [BitCastTypeClass];
847848
let Args = [ArgBool, ArgUint32, ArgFltSemantics];
848849
let HasGroup = 1;
849850
}
850851

851-
def BitCastPtr : Opcode;
852+
def BitCast : Opcode;

clang/lib/CodeGen/CGHLSLRuntime.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -512,13 +512,17 @@ void CGHLSLRuntime::generateGlobalCtorDtorCalls() {
512512
IP = Token->getNextNode();
513513
}
514514
IRBuilder<> B(IP);
515-
for (auto *Fn : CtorFns)
516-
B.CreateCall(FunctionCallee(Fn), {}, OB);
515+
for (auto *Fn : CtorFns) {
516+
auto CI = B.CreateCall(FunctionCallee(Fn), {}, OB);
517+
CI->setCallingConv(Fn->getCallingConv());
518+
}
517519

518520
// Insert global dtors before the terminator of the last instruction
519521
B.SetInsertPoint(F.back().getTerminator());
520-
for (auto *Fn : DtorFns)
521-
B.CreateCall(FunctionCallee(Fn), {}, OB);
522+
for (auto *Fn : DtorFns) {
523+
auto CI = B.CreateCall(FunctionCallee(Fn), {}, OB);
524+
CI->setCallingConv(Fn->getCallingConv());
525+
}
522526
}
523527

524528
// No need to keep global ctors/dtors for non-lib profile after call to

clang/lib/Sema/SemaStmtAsm.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -664,11 +664,16 @@ StmtResult Sema::ActOnGCCAsmStmt(SourceLocation AsmLoc, bool IsSimple,
664664
SmallerValueMentioned |= OutSize < InSize;
665665
}
666666

667+
// If the input is an integer register while the output is floating point,
668+
// or vice-versa, there is no way they can work together.
669+
bool FPTiedToInt = (InputDomain == AD_FP) ^ (OutputDomain == AD_FP);
670+
667671
// If the smaller value wasn't mentioned in the asm string, and if the
668672
// output was a register, just extend the shorter one to the size of the
669673
// larger one.
670-
if (!SmallerValueMentioned && InputDomain != AD_Other &&
674+
if (!SmallerValueMentioned && !FPTiedToInt && InputDomain != AD_Other &&
671675
OutputConstraintInfos[TiedTo].allowsRegister()) {
676+
672677
// FIXME: GCC supports the OutSize to be 128 at maximum. Currently codegen
673678
// crash when the size larger than the register size. So we limit it here.
674679
if (OutTy->isStructureType() &&

clang/test/AST/ByteCode/builtin-bit-cast.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -507,3 +507,11 @@ typedef bool bool9 __attribute__((ext_vector_type(9)));
507507
// both-error@+2 {{constexpr variable 'bad_bool9_to_short' must be initialized by a constant expression}}
508508
// both-note@+1 {{bit_cast involving type 'bool __attribute__((ext_vector_type(9)))' (vector of 9 'bool' values) is not allowed in a constant expression; element size 1 * element count 9 is not a multiple of the byte size 8}}
509509
constexpr unsigned short bad_bool9_to_short = __builtin_bit_cast(unsigned short, bool9{1,1,0,1,0,1,0,1,0});
510+
511+
// both-warning@+2 {{returning reference to local temporary object}}
512+
// both-note@+1 {{temporary created here}}
513+
constexpr const intptr_t &returns_local() { return 0L; }
514+
515+
// both-error@+2 {{constexpr variable 'test_nullptr_bad' must be initialized by a constant expression}}
516+
// both-note@+1 {{read of temporary whose lifetime has ended}}
517+
constexpr nullptr_t test_nullptr_bad = __builtin_bit_cast(nullptr_t, returns_local());

clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c

Lines changed: 76 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
21
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
32
// REQUIRES: aarch64-registered-target
43

@@ -239,3 +238,79 @@ void test_svmla_single_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t
239238
void test_svmla_single_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
240239
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, fpm);
241240
}
241+
242+
// FMLAL (multi)
243+
244+
// CHECK-LABEL: define dso_local void @test_svmla_multi_za16_vg2x2(
245+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
246+
// CHECK-NEXT: [[ENTRY:.*:]]
247+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
248+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
249+
// CHECK-NEXT: ret void
250+
//
251+
// CPP-CHECK-LABEL: define dso_local void @_Z27test_svmla_multi_za16_vg2x2j13svmfloat8x2_tS_m(
252+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
253+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
254+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
255+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
256+
// CPP-CHECK-NEXT: ret void
257+
//
258+
void test_svmla_multi_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
259+
SME_ACLE_FUNC(svmla_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, fpm);
260+
}
261+
262+
// CHECK-LABEL: define dso_local void @test_svmla_multi_za16_vg2x4(
263+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
264+
// CHECK-NEXT: [[ENTRY:.*:]]
265+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
266+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
267+
// CHECK-NEXT: ret void
268+
//
269+
// CPP-CHECK-LABEL: define dso_local void @_Z27test_svmla_multi_za16_vg2x4j13svmfloat8x4_tS_m(
270+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
271+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
272+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
273+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
274+
// CPP-CHECK-NEXT: ret void
275+
//
276+
void test_svmla_multi_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8x4_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
277+
SME_ACLE_FUNC(svmla_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, fpm);
278+
}
279+
280+
// FMLALL (multi)
281+
282+
// CHECK-LABEL: define dso_local void @test_svmla_multi_za32_vg4x2(
283+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
284+
// CHECK-NEXT: [[ENTRY:.*:]]
285+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
286+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
287+
// CHECK-NEXT: ret void
288+
//
289+
// CPP-CHECK-LABEL: define dso_local void @_Z27test_svmla_multi_za32_vg4x2j13svmfloat8x2_tS_m(
290+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
291+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
292+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
293+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
294+
// CPP-CHECK-NEXT: ret void
295+
//
296+
void test_svmla_multi_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
297+
SME_ACLE_FUNC(svmla_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, fpm);
298+
}
299+
300+
// CHECK-LABEL: define dso_local void @test_svmla_multi_za32_vg4x4(
301+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
302+
// CHECK-NEXT: [[ENTRY:.*:]]
303+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
304+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
305+
// CHECK-NEXT: ret void
306+
//
307+
// CPP-CHECK-LABEL: define dso_local void @_Z27test_svmla_multi_za32_vg4x4j13svmfloat8x4_tS_m(
308+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
309+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
310+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
311+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
312+
// CPP-CHECK-NEXT: ret void
313+
//
314+
void test_svmla_multi_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8x4_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
315+
SME_ACLE_FUNC(svmla_za32,_mf8,_vg4x4_fpm,,)(slice, zn, zm, fpm);
316+
}
Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -O3 -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
2+
// RUN: %clang_cc1 -triple spirv-vulkan-compute -x hlsl -emit-llvm -O3 -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
3+
4+
// CHECK-SPIRV: %"class.hlsl::RWBuffer" = type { target("spirv.Image", float, 5, 2, 0, 0, 2, 0) }
5+
// CHECK-DXIL: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", float, 1, 0, 0) }
6+
RWBuffer<float> Buf : register(u5, space3);
7+
8+
[shader("compute")]
9+
[numthreads(1, 1, 1)]
10+
void main() {
11+
// CHECK: define void @main()
12+
// CHECK-NEXT: entry:
13+
14+
// CHECK-SPIRV-NEXT: %Buf_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 0) @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
15+
// CHECK-SPIRV-NEXT: store target("spirv.Image", float, 5, 2, 0, 0, 2, 0) %Buf_h.i, ptr @Buf, align 8
16+
17+
// CHECK-DXIL-NEXT: %Buf_h.i = tail call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
18+
// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h.i, ptr @Buf, align 4
19+
20+
// CHECK-NEXT: ret void
21+
}

clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_mla.c

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,4 +41,16 @@ void test_svmla(uint32_t slice, svmfloat8_t zn, svmfloat8x2_t znx2, svmfloat8x4_
4141

4242
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
4343
svmla_single_za32_mf8_vg4x4_fpm(slice, znx4, zn, fpmr);
44+
45+
// expected-error@+1 {{'svmla_za16_mf8_vg2x2_fpm' needs target feature sme,sme-f8f16}}
46+
svmla_za16_mf8_vg2x2_fpm(slice, znx2, znx2, fpmr);
47+
48+
// expected-error@+1 {{'svmla_za16_mf8_vg2x4_fpm' needs target feature sme,sme-f8f16}}
49+
svmla_za16_mf8_vg2x4_fpm(slice, znx4, znx4, fpmr);
50+
51+
// expected-error@+1 {{'svmla_za32_mf8_vg4x2_fpm' needs target feature sme,sme-f8f32}}
52+
svmla_za32_mf8_vg4x2_fpm(slice, znx2, znx2, fpmr);
53+
54+
// expected-error@+1 {{'svmla_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
55+
svmla_za32_mf8_vg4x4_fpm(slice, znx4, znx4, fpmr);
4456
}

0 commit comments

Comments
 (0)