Skip to content

Commit 049d6a3

Browse files
committed
[X86] Add SM4 instructions.
For more details about these instructions, please refer to the latest ISE document: https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html Reviewed By: pengfei, skan Differential Revision: https://reviews.llvm.org/D155148
1 parent 75d7180 commit 049d6a3

File tree

28 files changed

+1129
-3
lines changed

28 files changed

+1129
-3
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,9 @@ X86 Support
821821
* Support intrinsic of ``_mm_sm3msg1_epi32``.
822822
* Support intrinsic of ``_mm_sm3msg2_epi32``.
823823
* Support intrinsic of ``_mm_sm3rnds2_epi32``.
824+
- Support ISA of ``SM4``.
825+
* Support intrinsic of ``_mm(256)_sm4key4_epi32``.
826+
* Support intrinsic of ``_mm(256)_sm4rnds4_epi32``.
824827

825828
Arm and AArch64 Support
826829
^^^^^^^^^^^^^^^^^^^^^^^

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2151,6 +2151,12 @@ TARGET_BUILTIN(__builtin_ia32_vsm3msg1, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
21512151
TARGET_BUILTIN(__builtin_ia32_vsm3msg2, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
21522152
TARGET_BUILTIN(__builtin_ia32_vsm3rnds2, "V4UiV4UiV4UiV4UiIUi", "nV:128:", "sm3")
21532153

2154+
// SM4
2155+
TARGET_BUILTIN(__builtin_ia32_vsm4key4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
2156+
TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
2157+
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
2158+
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
2159+
21542160
#undef BUILTIN
21552161
#undef TARGET_BUILTIN
21562162
#undef TARGET_HEADER_BUILTIN

clang/include/clang/Driver/Options.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5060,6 +5060,8 @@ def msha512 : Flag<["-"], "msha512">, Group<m_x86_Features_Group>;
50605060
def mno_sha512 : Flag<["-"], "mno-sha512">, Group<m_x86_Features_Group>;
50615061
def msm3 : Flag<["-"], "msm3">, Group<m_x86_Features_Group>;
50625062
def mno_sm3 : Flag<["-"], "mno-sm3">, Group<m_x86_Features_Group>;
5063+
def msm4 : Flag<["-"], "msm4">, Group<m_x86_Features_Group>;
5064+
def mno_sm4 : Flag<["-"], "mno-sm4">, Group<m_x86_Features_Group>;
50635065
def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
50645066
def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
50655067
def mtsxldtrk : Flag<["-"], "mtsxldtrk">, Group<m_x86_Features_Group>;

clang/lib/Basic/Targets/X86.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
267267
HasSHSTK = true;
268268
} else if (Feature == "+sm3") {
269269
HasSM3 = true;
270+
} else if (Feature == "+sm4") {
271+
HasSM4 = true;
270272
} else if (Feature == "+movbe") {
271273
HasMOVBE = true;
272274
} else if (Feature == "+sgx") {
@@ -780,6 +782,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
780782
Builder.defineMacro("__SGX__");
781783
if (HasSM3)
782784
Builder.defineMacro("__SM3__");
785+
if (HasSM4)
786+
Builder.defineMacro("__SM4__");
783787
if (HasPREFETCHI)
784788
Builder.defineMacro("__PREFETCHI__");
785789
if (HasPREFETCHWT1)
@@ -1010,6 +1014,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
10101014
.Case("sha512", true)
10111015
.Case("shstk", true)
10121016
.Case("sm3", true)
1017+
.Case("sm4", true)
10131018
.Case("sse", true)
10141019
.Case("sse2", true)
10151020
.Case("sse3", true)
@@ -1117,6 +1122,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
11171122
.Case("sha512", HasSHA512)
11181123
.Case("shstk", HasSHSTK)
11191124
.Case("sm3", HasSM3)
1125+
.Case("sm4", HasSM4)
11201126
.Case("sse", SSELevel >= SSE1)
11211127
.Case("sse2", SSELevel >= SSE2)
11221128
.Case("sse3", SSELevel >= SSE3)

clang/lib/Basic/Targets/X86.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
116116
bool HasSHSTK = false;
117117
bool HasSM3 = false;
118118
bool HasSGX = false;
119+
bool HasSM4 = false;
119120
bool HasCX8 = false;
120121
bool HasCX16 = false;
121122
bool HasFXSR = false;

clang/lib/Headers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ set(x86_files
206206
sha512intrin.h
207207
shaintrin.h
208208
sm3intrin.h
209+
sm4intrin.h
209210
smmintrin.h
210211
tbmintrin.h
211212
tmmintrin.h

clang/lib/Headers/immintrin.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@
279279
#include <sm3intrin.h>
280280
#endif
281281

282+
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
283+
defined(__SM4__)
284+
#include <sm4intrin.h>
285+
#endif
286+
282287
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
283288
defined(__RDPID__)
284289
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).

clang/lib/Headers/sm4intrin.h

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
10+
#ifndef __IMMINTRIN_H
11+
#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
12+
#endif // __IMMINTRIN_H
13+
14+
#ifndef __SM4INTRIN_H
15+
#define __SM4INTRIN_H
16+
17+
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
18+
/// operates on independent 128-bit lanes. The calculated results are
19+
/// stored in \a dst.
20+
/// \headerfile <immintrin.h>
21+
///
22+
/// \code
23+
/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
24+
/// \endcode
25+
///
26+
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
27+
///
28+
/// \param __A
29+
/// A 128-bit vector of [4 x int].
30+
/// \param __B
31+
/// A 128-bit vector of [4 x int].
32+
/// \returns
33+
/// A 128-bit vector of [4 x int].
34+
///
35+
/// \code{.operation}
36+
/// DEFINE ROL32(dword, n) {
37+
/// count := n % 32
38+
/// dest := (dword << count) | (dword >> (32-count))
39+
/// RETURN dest
40+
/// }
41+
/// DEFINE SBOX_BYTE(dword, i) {
42+
/// RETURN sbox[dword.byte[i]]
43+
/// }
44+
/// DEFINE lower_t(dword) {
45+
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
46+
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
47+
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
48+
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
49+
/// RETURN tmp
50+
/// }
51+
/// DEFINE L_KEY(dword) {
52+
/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
53+
/// }
54+
/// DEFINE T_KEY(dword) {
55+
/// RETURN L_KEY(lower_t(dword))
56+
/// }
57+
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
58+
/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
59+
/// }
60+
/// FOR i:= 0 to 0
61+
/// P[0] := __B.xmm[i].dword[0]
62+
/// P[1] := __B.xmm[i].dword[1]
63+
/// P[2] := __B.xmm[i].dword[2]
64+
/// P[3] := __B.xmm[i].dword[3]
65+
/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
66+
/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
67+
/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
68+
/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
69+
/// DEST.xmm[i].dword[0] := C[0]
70+
/// DEST.xmm[i].dword[1] := C[1]
71+
/// DEST.xmm[i].dword[2] := C[2]
72+
/// DEST.xmm[i].dword[3] := C[3]
73+
/// ENDFOR
74+
/// DEST[MAX:128] := 0
75+
/// \endcode
76+
#define _mm_sm4key4_epi32(A, B) \
77+
(__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B)
78+
79+
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
80+
/// operates on independent 128-bit lanes. The calculated results are
81+
/// stored in \a dst.
82+
/// \headerfile <immintrin.h>
83+
///
84+
/// \code
85+
/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
86+
/// \endcode
87+
///
88+
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
89+
///
90+
/// \param __A
91+
/// A 256-bit vector of [8 x int].
92+
/// \param __B
93+
/// A 256-bit vector of [8 x int].
94+
/// \returns
95+
/// A 256-bit vector of [8 x int].
96+
///
97+
/// \code{.operation}
98+
/// DEFINE ROL32(dword, n) {
99+
/// count := n % 32
100+
/// dest := (dword << count) | (dword >> (32-count))
101+
/// RETURN dest
102+
/// }
103+
/// DEFINE SBOX_BYTE(dword, i) {
104+
/// RETURN sbox[dword.byte[i]]
105+
/// }
106+
/// DEFINE lower_t(dword) {
107+
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
108+
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
109+
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
110+
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
111+
/// RETURN tmp
112+
/// }
113+
/// DEFINE L_KEY(dword) {
114+
/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
115+
/// }
116+
/// DEFINE T_KEY(dword) {
117+
/// RETURN L_KEY(lower_t(dword))
118+
/// }
119+
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
120+
/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
121+
/// }
122+
/// FOR i:= 0 to 1
123+
/// P[0] := __B.xmm[i].dword[0]
124+
/// P[1] := __B.xmm[i].dword[1]
125+
/// P[2] := __B.xmm[i].dword[2]
126+
/// P[3] := __B.xmm[i].dword[3]
127+
/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
128+
/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
129+
/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
130+
/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
131+
/// DEST.xmm[i].dword[0] := C[0]
132+
/// DEST.xmm[i].dword[1] := C[1]
133+
/// DEST.xmm[i].dword[2] := C[2]
134+
/// DEST.xmm[i].dword[3] := C[3]
135+
/// ENDFOR
136+
/// DEST[MAX:256] := 0
137+
/// \endcode
138+
#define _mm256_sm4key4_epi32(A, B) \
139+
(__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B)
140+
141+
/// This intrinisc performs four rounds of SM4 encryption. The intrinisc
142+
/// operates on independent 128-bit lanes. The calculated results are
143+
/// stored in \a dst.
144+
/// \headerfile <immintrin.h>
145+
///
146+
/// \code
147+
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
148+
/// \endcode
149+
///
150+
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
151+
///
152+
/// \param __A
153+
/// A 128-bit vector of [4 x int].
154+
/// \param __B
155+
/// A 128-bit vector of [4 x int].
156+
/// \returns
157+
/// A 128-bit vector of [4 x int].
158+
///
159+
/// \code{.operation}
160+
/// DEFINE ROL32(dword, n) {
161+
/// count := n % 32
162+
/// dest := (dword << count) | (dword >> (32-count))
163+
/// RETURN dest
164+
/// }
165+
/// DEFINE lower_t(dword) {
166+
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
167+
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
168+
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
169+
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
170+
/// RETURN tmp
171+
/// }
172+
/// DEFINE L_RND(dword) {
173+
/// tmp := dword
174+
/// tmp := tmp ^ ROL32(dword, 2)
175+
/// tmp := tmp ^ ROL32(dword, 10)
176+
/// tmp := tmp ^ ROL32(dword, 18)
177+
/// tmp := tmp ^ ROL32(dword, 24)
178+
/// RETURN tmp
179+
/// }
180+
/// DEFINE T_RND(dword) {
181+
/// RETURN L_RND(lower_t(dword))
182+
/// }
183+
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
184+
/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
185+
/// }
186+
/// FOR i:= 0 to 0
187+
/// P[0] := __B.xmm[i].dword[0]
188+
/// P[1] := __B.xmm[i].dword[1]
189+
/// P[2] := __B.xmm[i].dword[2]
190+
/// P[3] := __B.xmm[i].dword[3]
191+
/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
192+
/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
193+
/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
194+
/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
195+
/// DEST.xmm[i].dword[0] := C[0]
196+
/// DEST.xmm[i].dword[1] := C[1]
197+
/// DEST.xmm[i].dword[2] := C[2]
198+
/// DEST.xmm[i].dword[3] := C[3]
199+
/// ENDFOR
200+
/// DEST[MAX:128] := 0
201+
/// \endcode
202+
#define _mm_sm4rnds4_epi32(A, B) \
203+
(__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B)
204+
205+
/// This intrinisc performs four rounds of SM4 encryption. The intrinisc
206+
/// operates on independent 128-bit lanes. The calculated results are
207+
/// stored in \a dst.
208+
/// \headerfile <immintrin.h>
209+
///
210+
/// \code
211+
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
212+
/// \endcode
213+
///
214+
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
215+
///
216+
/// \param __A
217+
/// A 256-bit vector of [8 x int].
218+
/// \param __B
219+
/// A 256-bit vector of [8 x int].
220+
/// \returns
221+
/// A 256-bit vector of [8 x int].
222+
///
223+
/// \code{.operation}
224+
/// DEFINE ROL32(dword, n) {
225+
/// count := n % 32
226+
/// dest := (dword << count) | (dword >> (32-count))
227+
/// RETURN dest
228+
/// }
229+
/// DEFINE lower_t(dword) {
230+
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
231+
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
232+
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
233+
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
234+
/// RETURN tmp
235+
/// }
236+
/// DEFINE L_RND(dword) {
237+
/// tmp := dword
238+
/// tmp := tmp ^ ROL32(dword, 2)
239+
/// tmp := tmp ^ ROL32(dword, 10)
240+
/// tmp := tmp ^ ROL32(dword, 18)
241+
/// tmp := tmp ^ ROL32(dword, 24)
242+
/// RETURN tmp
243+
/// }
244+
/// DEFINE T_RND(dword) {
245+
/// RETURN L_RND(lower_t(dword))
246+
/// }
247+
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
248+
/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
249+
/// }
250+
/// FOR i:= 0 to 0
251+
/// P[0] := __B.xmm[i].dword[0]
252+
/// P[1] := __B.xmm[i].dword[1]
253+
/// P[2] := __B.xmm[i].dword[2]
254+
/// P[3] := __B.xmm[i].dword[3]
255+
/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
256+
/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
257+
/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
258+
/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
259+
/// DEST.xmm[i].dword[0] := C[0]
260+
/// DEST.xmm[i].dword[1] := C[1]
261+
/// DEST.xmm[i].dword[2] := C[2]
262+
/// DEST.xmm[i].dword[3] := C[3]
263+
/// ENDFOR
264+
/// DEST[MAX:256] := 0
265+
/// \endcode
266+
#define _mm256_sm4rnds4_epi32(A, B) \
267+
(__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B)
268+
269+
#endif // __SM4INTRIN_H

clang/test/CodeGen/X86/sm4-builtins.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +sm4 -emit-llvm -o - -Wall -Werror | FileCheck %s
2+
// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +sm4 -emit-llvm -o - -Wall -Werror | FileCheck %s
3+
4+
#include <immintrin.h>
5+
6+
__m128i test_mm_sm4key4_epi32(__m128i __A, __m128i __B) {
7+
// CHECK-LABEL: @test_mm_sm4key4_epi32(
8+
// CHECK: call <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
9+
return _mm_sm4key4_epi32(__A, __B);
10+
}
11+
12+
__m256i test_mm256_sm4key4_epi32(__m256i __A, __m256i __B) {
13+
// CHECK-LABEL: @test_mm256_sm4key4_epi32(
14+
// CHECK: call <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
15+
return _mm256_sm4key4_epi32(__A, __B);
16+
}
17+
18+
__m128i test_mm_sm4rnds4_epi32(__m128i __A, __m128i __B) {
19+
// CHECK-LABEL: @test_mm_sm4rnds4_epi32(
20+
// CHECK: call <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
21+
return _mm_sm4rnds4_epi32(__A, __B);
22+
}
23+
24+
__m256i test_mm256_sm4rnds4_epi32(__m256i __A, __m256i __B) {
25+
// CHECK-LABEL: @test_mm256_sm4rnds4_epi32(
26+
// CHECK: call <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
27+
return _mm256_sm4rnds4_epi32(__A, __B);
28+
}

0 commit comments

Comments
 (0)