Skip to content

Commit f77101e

Browse files
authored
[X86][AMX] Support AMX-MOVRS (#115151)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
1 parent 82d5dd2 commit f77101e

34 files changed

+1273
-9
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,7 @@ X86 Support
741741
* Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
742742
- Support ISA of ``AMX-FP8``.
743743
- Support ISA of ``AMX-TRANSPOSE``.
744+
- Support ISA of ``AMX-MOVRS``.
744745
- Support ISA of ``AMX-AVX512``.
745746
- Support ISA of ``AMX-TF32``.
746747

clang/include/clang/Basic/BuiltinsX86_64.def

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
117117
// AMX internal builtin
118118
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
119119
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
120+
TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
120121
TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
122+
TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
121123
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
122124
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
123125
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
@@ -129,10 +131,15 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i",
129131
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
130132
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
131133
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
134+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
132135
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
136+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
133137
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
138+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
134139
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
140+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
135141
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
142+
136143
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
137144
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
138145
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -147,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
147154
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
148155
TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
149156
TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
157+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
158+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
159+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
160+
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
161+
162+
TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
163+
TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")
150164

151165
TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
152166
TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")

clang/include/clang/Driver/Options.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
63036303
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
63046304
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
63056305
def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
6306+
def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
6307+
def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
63066308
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
63076309
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
63086310
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;

clang/lib/Basic/Targets/X86.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
430430
HasAMXCOMPLEX = true;
431431
} else if (Feature == "+amx-fp8") {
432432
HasAMXFP8 = true;
433+
} else if (Feature == "+amx-movrs") {
434+
HasAMXMOVRS = true;
433435
} else if (Feature == "+amx-transpose") {
434436
HasAMXTRANSPOSE = true;
435437
} else if (Feature == "+amx-avx512") {
@@ -957,6 +959,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
957959
Builder.defineMacro("__AMX_COMPLEX__");
958960
if (HasAMXFP8)
959961
Builder.defineMacro("__AMX_FP8__");
962+
if (HasAMXMOVRS)
963+
Builder.defineMacro("__AMX_MOVRS__");
960964
if (HasAMXTRANSPOSE)
961965
Builder.defineMacro("__AMX_TRANSPOSE__");
962966
if (HasAMXAVX512)
@@ -1094,6 +1098,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
10941098
.Case("amx-fp16", true)
10951099
.Case("amx-fp8", true)
10961100
.Case("amx-int8", true)
1101+
.Case("amx-movrs", true)
10971102
.Case("amx-tf32", true)
10981103
.Case("amx-tile", true)
10991104
.Case("amx-transpose", true)
@@ -1216,6 +1221,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
12161221
.Case("amx-fp16", HasAMXFP16)
12171222
.Case("amx-fp8", HasAMXFP8)
12181223
.Case("amx-int8", HasAMXINT8)
1224+
.Case("amx-movrs", HasAMXMOVRS)
12191225
.Case("amx-tf32", HasAMXTF32)
12201226
.Case("amx-tile", HasAMXTILE)
12211227
.Case("amx-transpose", HasAMXTRANSPOSE)

clang/lib/Basic/Targets/X86.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
158158
bool HasAMXBF16 = false;
159159
bool HasAMXCOMPLEX = false;
160160
bool HasAMXFP8 = false;
161+
bool HasAMXMOVRS = false;
161162
bool HasAMXTRANSPOSE = false;
162163
bool HasAMXAVX512 = false;
163164
bool HasAMXTF32 = false;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17025,25 +17025,41 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1702517025
}
1702617026
// Corresponding to intrisics which will return 2 tiles (tile0_tile1).
1702717027
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
17028+
case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
1702817029
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
17030+
case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
1702917031
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
17030-
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
17032+
case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
17033+
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
17034+
case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
1703117035
Intrinsic::ID IID;
1703217036
switch (BuiltinID) {
1703317037
default:
1703417038
llvm_unreachable("Unsupported intrinsic!");
1703517039
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
1703617040
IID = Intrinsic::x86_t2rpntlvwz0_internal;
1703717041
break;
17042+
case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
17043+
IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
17044+
break;
1703817045
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
1703917046
IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
1704017047
break;
17048+
case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
17049+
IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
17050+
break;
1704117051
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
1704217052
IID = Intrinsic::x86_t2rpntlvwz1_internal;
1704317053
break;
17054+
case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
17055+
IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
17056+
break;
1704417057
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
1704517058
IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
1704617059
break;
17060+
case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
17061+
IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
17062+
break;
1704717063
}
1704817064

1704917065
// Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)

clang/lib/Headers/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ set(x86_files
151151
amxfp16intrin.h
152152
amxfp8intrin.h
153153
amxintrin.h
154+
amxmovrsintrin.h
155+
amxmovrstransposeintrin.h
154156
amxtf32intrin.h
155157
amxtf32transposeintrin.h
156158
amxtransposeintrin.h

clang/lib/Headers/amxmovrsintrin.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
* ===-------------------------------------------------------------------=== */
8+
9+
#ifndef __IMMINTRIN_H
10+
#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
11+
#endif /* __IMMINTRIN_H */
12+
13+
#ifndef __AMXMOVRSINTRIN_H
14+
#define __AMXMOVRSINTRIN_H
15+
#ifdef __x86_64__
16+
17+
#define __DEFAULT_FN_ATTRS_MOVRS \
18+
__attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))
19+
20+
#define _tile_loaddrs(dst, base, stride) \
21+
__builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
22+
(__SIZE_TYPE__)(stride))
23+
#define _tile_stream_loaddrs(dst, base, stride) \
24+
__builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
25+
(__SIZE_TYPE__)(stride))
26+
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
27+
_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
28+
__SIZE_TYPE__ stride) {
29+
return __builtin_ia32_tileloaddrs64_internal(m, n, base,
30+
(__SIZE_TYPE__)(stride));
31+
}
32+
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
33+
_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
34+
__SIZE_TYPE__ stride) {
35+
return __builtin_ia32_tileloaddrst164_internal(m, n, base,
36+
(__SIZE_TYPE__)(stride));
37+
}
38+
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
39+
__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
40+
dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
41+
}
42+
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
43+
__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
44+
dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
45+
}
46+
#undef __DEFAULT_FN_ATTRS_MOVRS
47+
#endif /* __x86_64__ */
48+
#endif /* __AMXMOVRSINTRIN_H */

0 commit comments

Comments
 (0)