Skip to content

[X86][AMX] Support AMX-MOVRS #115151

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,7 @@ X86 Support
* Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
- Support ISA of ``AMX-FP8``.
- Support ISA of ``AMX-TRANSPOSE``.
- Support ISA of ``AMX-MOVRS``.
- Support ISA of ``AMX-AVX512``.
- Support ISA of ``AMX-TF32``.

Expand Down
14 changes: 14 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86_64.def
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
Expand All @@ -129,10 +131,15 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")

TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
Expand All @@ -147,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")

TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")

TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Basic/Targets/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXCOMPLEX = true;
} else if (Feature == "+amx-fp8") {
HasAMXFP8 = true;
} else if (Feature == "+amx-movrs") {
HasAMXMOVRS = true;
} else if (Feature == "+amx-transpose") {
HasAMXTRANSPOSE = true;
} else if (Feature == "+amx-avx512") {
Expand Down Expand Up @@ -957,6 +959,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_COMPLEX__");
if (HasAMXFP8)
Builder.defineMacro("__AMX_FP8__");
if (HasAMXMOVRS)
Builder.defineMacro("__AMX_MOVRS__");
if (HasAMXTRANSPOSE)
Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasAMXAVX512)
Expand Down Expand Up @@ -1094,6 +1098,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-fp16", true)
.Case("amx-fp8", true)
.Case("amx-int8", true)
.Case("amx-movrs", true)
.Case("amx-tf32", true)
.Case("amx-tile", true)
.Case("amx-transpose", true)
Expand Down Expand Up @@ -1216,6 +1221,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-fp8", HasAMXFP8)
.Case("amx-int8", HasAMXINT8)
.Case("amx-movrs", HasAMXMOVRS)
.Case("amx-tf32", HasAMXTF32)
.Case("amx-tile", HasAMXTILE)
.Case("amx-transpose", HasAMXTRANSPOSE)
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Basic/Targets/X86.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
bool HasAMXFP8 = false;
bool HasAMXMOVRS = false;
bool HasAMXTRANSPOSE = false;
bool HasAMXAVX512 = false;
bool HasAMXTF32 = false;
Expand Down
18 changes: 17 additions & 1 deletion clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17025,25 +17025,41 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
}
// Corresponding to intrisics which will return 2 tiles (tile0_tile1).
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
Intrinsic::ID IID;
switch (BuiltinID) {
default:
llvm_unreachable("Unsupported intrinsic!");
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
IID = Intrinsic::x86_t2rpntlvwz0_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
IID = Intrinsic::x86_t2rpntlvwz1_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
break;
}

// Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Headers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ set(x86_files
amxfp16intrin.h
amxfp8intrin.h
amxintrin.h
amxmovrsintrin.h
amxmovrstransposeintrin.h
amxtf32intrin.h
amxtf32transposeintrin.h
amxtransposeintrin.h
Expand Down
48 changes: 48 additions & 0 deletions clang/lib/Headers/amxmovrsintrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* ===-------------------------------------------------------------------=== */

#ifndef __IMMINTRIN_H
#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXMOVRSINTRIN_H
#define __AMXMOVRSINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_MOVRS \
__attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))

#define _tile_loaddrs(dst, base, stride) \
__builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
#define _tile_stream_loaddrs(dst, base, stride) \
__builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddrs64_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddrst164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
}
#undef __DEFAULT_FN_ATTRS_MOVRS
#endif /* __x86_64__ */
#endif /* __AMXMOVRSINTRIN_H */
Loading
Loading