|
| 1 | +/*===--------------- sm4intrin.h - SM4 intrinsics -----------------=== |
| 2 | + * |
| 3 | + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | + * See https://llvm.org/LICENSE.txt for license information. |
| 5 | + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | + * |
| 7 | + *===-----------------------------------------------------------------------=== |
| 8 | + */ |
| 9 | + |
| 10 | +#ifndef __IMMINTRIN_H |
| 11 | +#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead." |
| 12 | +#endif // __IMMINTRIN_H |
| 13 | + |
| 14 | +#ifndef __SM4INTRIN_H |
| 15 | +#define __SM4INTRIN_H |
| 16 | + |
| 17 | +/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic |
| 18 | +/// operates on independent 128-bit lanes. The calculated results are |
| 19 | +/// stored in \a dst. |
| 20 | +/// \headerfile <immintrin.h> |
| 21 | +/// |
| 22 | +/// \code |
| 23 | +/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B) |
| 24 | +/// \endcode |
| 25 | +/// |
| 26 | +/// This intrinsic corresponds to the \c VSM4KEY4 instruction. |
| 27 | +/// |
| 28 | +/// \param __A |
| 29 | +/// A 128-bit vector of [4 x int]. |
| 30 | +/// \param __B |
| 31 | +/// A 128-bit vector of [4 x int]. |
| 32 | +/// \returns |
| 33 | +/// A 128-bit vector of [4 x int]. |
| 34 | +/// |
| 35 | +/// \code{.operation} |
| 36 | +/// DEFINE ROL32(dword, n) { |
| 37 | +/// count := n % 32 |
| 38 | +/// dest := (dword << count) | (dword >> (32-count)) |
| 39 | +/// RETURN dest |
| 40 | +/// } |
| 41 | +/// DEFINE SBOX_BYTE(dword, i) { |
| 42 | +/// RETURN sbox[dword.byte[i]] |
| 43 | +/// } |
| 44 | +/// DEFINE lower_t(dword) { |
| 45 | +/// tmp.byte[0] := SBOX_BYTE(dword, 0) |
| 46 | +/// tmp.byte[1] := SBOX_BYTE(dword, 1) |
| 47 | +/// tmp.byte[2] := SBOX_BYTE(dword, 2) |
| 48 | +/// tmp.byte[3] := SBOX_BYTE(dword, 3) |
| 49 | +/// RETURN tmp |
| 50 | +/// } |
| 51 | +/// DEFINE L_KEY(dword) { |
| 52 | +/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) |
| 53 | +/// } |
| 54 | +/// DEFINE T_KEY(dword) { |
| 55 | +/// RETURN L_KEY(lower_t(dword)) |
| 56 | +/// } |
| 57 | +/// DEFINE F_KEY(X0, X1, X2, X3, round_key) { |
| 58 | +/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) |
| 59 | +/// } |
| 60 | +/// FOR i:= 0 to 0 |
| 61 | +/// P[0] := __B.xmm[i].dword[0] |
| 62 | +/// P[1] := __B.xmm[i].dword[1] |
| 63 | +/// P[2] := __B.xmm[i].dword[2] |
| 64 | +/// P[3] := __B.xmm[i].dword[3] |
| 65 | +/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) |
| 66 | +/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) |
| 67 | +/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) |
| 68 | +/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) |
| 69 | +/// DEST.xmm[i].dword[0] := C[0] |
| 70 | +/// DEST.xmm[i].dword[1] := C[1] |
| 71 | +/// DEST.xmm[i].dword[2] := C[2] |
| 72 | +/// DEST.xmm[i].dword[3] := C[3] |
| 73 | +/// ENDFOR |
| 74 | +/// DEST[MAX:128] := 0 |
| 75 | +/// \endcode |
| 76 | +#define _mm_sm4key4_epi32(A, B) \ |
| 77 | + (__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B) |
| 78 | + |
| 79 | +/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic |
| 80 | +/// operates on independent 128-bit lanes. The calculated results are |
| 81 | +/// stored in \a dst. |
| 82 | +/// \headerfile <immintrin.h> |
| 83 | +/// |
| 84 | +/// \code |
| 85 | +/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B) |
| 86 | +/// \endcode |
| 87 | +/// |
| 88 | +/// This intrinsic corresponds to the \c VSM4KEY4 instruction. |
| 89 | +/// |
| 90 | +/// \param __A |
| 91 | +/// A 256-bit vector of [8 x int]. |
| 92 | +/// \param __B |
| 93 | +/// A 256-bit vector of [8 x int]. |
| 94 | +/// \returns |
| 95 | +/// A 256-bit vector of [8 x int]. |
| 96 | +/// |
| 97 | +/// \code{.operation} |
| 98 | +/// DEFINE ROL32(dword, n) { |
| 99 | +/// count := n % 32 |
| 100 | +/// dest := (dword << count) | (dword >> (32-count)) |
| 101 | +/// RETURN dest |
| 102 | +/// } |
| 103 | +/// DEFINE SBOX_BYTE(dword, i) { |
| 104 | +/// RETURN sbox[dword.byte[i]] |
| 105 | +/// } |
| 106 | +/// DEFINE lower_t(dword) { |
| 107 | +/// tmp.byte[0] := SBOX_BYTE(dword, 0) |
| 108 | +/// tmp.byte[1] := SBOX_BYTE(dword, 1) |
| 109 | +/// tmp.byte[2] := SBOX_BYTE(dword, 2) |
| 110 | +/// tmp.byte[3] := SBOX_BYTE(dword, 3) |
| 111 | +/// RETURN tmp |
| 112 | +/// } |
| 113 | +/// DEFINE L_KEY(dword) { |
| 114 | +/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) |
| 115 | +/// } |
| 116 | +/// DEFINE T_KEY(dword) { |
| 117 | +/// RETURN L_KEY(lower_t(dword)) |
| 118 | +/// } |
| 119 | +/// DEFINE F_KEY(X0, X1, X2, X3, round_key) { |
| 120 | +/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) |
| 121 | +/// } |
| 122 | +/// FOR i:= 0 to 1 |
| 123 | +/// P[0] := __B.xmm[i].dword[0] |
| 124 | +/// P[1] := __B.xmm[i].dword[1] |
| 125 | +/// P[2] := __B.xmm[i].dword[2] |
| 126 | +/// P[3] := __B.xmm[i].dword[3] |
| 127 | +/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) |
| 128 | +/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) |
| 129 | +/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) |
| 130 | +/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) |
| 131 | +/// DEST.xmm[i].dword[0] := C[0] |
| 132 | +/// DEST.xmm[i].dword[1] := C[1] |
| 133 | +/// DEST.xmm[i].dword[2] := C[2] |
| 134 | +/// DEST.xmm[i].dword[3] := C[3] |
| 135 | +/// ENDFOR |
| 136 | +/// DEST[MAX:256] := 0 |
| 137 | +/// \endcode |
| 138 | +#define _mm256_sm4key4_epi32(A, B) \ |
| 139 | + (__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B) |
| 140 | + |
| 141 | +/// This intrinisc performs four rounds of SM4 encryption. The intrinisc |
| 142 | +/// operates on independent 128-bit lanes. The calculated results are |
| 143 | +/// stored in \a dst. |
| 144 | +/// \headerfile <immintrin.h> |
| 145 | +/// |
| 146 | +/// \code |
| 147 | +/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B) |
| 148 | +/// \endcode |
| 149 | +/// |
| 150 | +/// This intrinsic corresponds to the \c VSM4RNDS4 instruction. |
| 151 | +/// |
| 152 | +/// \param __A |
| 153 | +/// A 128-bit vector of [4 x int]. |
| 154 | +/// \param __B |
| 155 | +/// A 128-bit vector of [4 x int]. |
| 156 | +/// \returns |
| 157 | +/// A 128-bit vector of [4 x int]. |
| 158 | +/// |
| 159 | +/// \code{.operation} |
| 160 | +/// DEFINE ROL32(dword, n) { |
| 161 | +/// count := n % 32 |
| 162 | +/// dest := (dword << count) | (dword >> (32-count)) |
| 163 | +/// RETURN dest |
| 164 | +/// } |
| 165 | +/// DEFINE lower_t(dword) { |
| 166 | +/// tmp.byte[0] := SBOX_BYTE(dword, 0) |
| 167 | +/// tmp.byte[1] := SBOX_BYTE(dword, 1) |
| 168 | +/// tmp.byte[2] := SBOX_BYTE(dword, 2) |
| 169 | +/// tmp.byte[3] := SBOX_BYTE(dword, 3) |
| 170 | +/// RETURN tmp |
| 171 | +/// } |
| 172 | +/// DEFINE L_RND(dword) { |
| 173 | +/// tmp := dword |
| 174 | +/// tmp := tmp ^ ROL32(dword, 2) |
| 175 | +/// tmp := tmp ^ ROL32(dword, 10) |
| 176 | +/// tmp := tmp ^ ROL32(dword, 18) |
| 177 | +/// tmp := tmp ^ ROL32(dword, 24) |
| 178 | +/// RETURN tmp |
| 179 | +/// } |
| 180 | +/// DEFINE T_RND(dword) { |
| 181 | +/// RETURN L_RND(lower_t(dword)) |
| 182 | +/// } |
| 183 | +/// DEFINE F_RND(X0, X1, X2, X3, round_key) { |
| 184 | +/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) |
| 185 | +/// } |
| 186 | +/// FOR i:= 0 to 0 |
| 187 | +/// P[0] := __B.xmm[i].dword[0] |
| 188 | +/// P[1] := __B.xmm[i].dword[1] |
| 189 | +/// P[2] := __B.xmm[i].dword[2] |
| 190 | +/// P[3] := __B.xmm[i].dword[3] |
| 191 | +/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) |
| 192 | +/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) |
| 193 | +/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) |
| 194 | +/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) |
| 195 | +/// DEST.xmm[i].dword[0] := C[0] |
| 196 | +/// DEST.xmm[i].dword[1] := C[1] |
| 197 | +/// DEST.xmm[i].dword[2] := C[2] |
| 198 | +/// DEST.xmm[i].dword[3] := C[3] |
| 199 | +/// ENDFOR |
| 200 | +/// DEST[MAX:128] := 0 |
| 201 | +/// \endcode |
| 202 | +#define _mm_sm4rnds4_epi32(A, B) \ |
| 203 | + (__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B) |
| 204 | + |
| 205 | +/// This intrinisc performs four rounds of SM4 encryption. The intrinisc |
| 206 | +/// operates on independent 128-bit lanes. The calculated results are |
| 207 | +/// stored in \a dst. |
| 208 | +/// \headerfile <immintrin.h> |
| 209 | +/// |
| 210 | +/// \code |
| 211 | +/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B) |
| 212 | +/// \endcode |
| 213 | +/// |
| 214 | +/// This intrinsic corresponds to the \c VSM4RNDS4 instruction. |
| 215 | +/// |
| 216 | +/// \param __A |
| 217 | +/// A 256-bit vector of [8 x int]. |
| 218 | +/// \param __B |
| 219 | +/// A 256-bit vector of [8 x int]. |
| 220 | +/// \returns |
| 221 | +/// A 256-bit vector of [8 x int]. |
| 222 | +/// |
| 223 | +/// \code{.operation} |
| 224 | +/// DEFINE ROL32(dword, n) { |
| 225 | +/// count := n % 32 |
| 226 | +/// dest := (dword << count) | (dword >> (32-count)) |
| 227 | +/// RETURN dest |
| 228 | +/// } |
| 229 | +/// DEFINE lower_t(dword) { |
| 230 | +/// tmp.byte[0] := SBOX_BYTE(dword, 0) |
| 231 | +/// tmp.byte[1] := SBOX_BYTE(dword, 1) |
| 232 | +/// tmp.byte[2] := SBOX_BYTE(dword, 2) |
| 233 | +/// tmp.byte[3] := SBOX_BYTE(dword, 3) |
| 234 | +/// RETURN tmp |
| 235 | +/// } |
| 236 | +/// DEFINE L_RND(dword) { |
| 237 | +/// tmp := dword |
| 238 | +/// tmp := tmp ^ ROL32(dword, 2) |
| 239 | +/// tmp := tmp ^ ROL32(dword, 10) |
| 240 | +/// tmp := tmp ^ ROL32(dword, 18) |
| 241 | +/// tmp := tmp ^ ROL32(dword, 24) |
| 242 | +/// RETURN tmp |
| 243 | +/// } |
| 244 | +/// DEFINE T_RND(dword) { |
| 245 | +/// RETURN L_RND(lower_t(dword)) |
| 246 | +/// } |
| 247 | +/// DEFINE F_RND(X0, X1, X2, X3, round_key) { |
| 248 | +/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) |
| 249 | +/// } |
| 250 | +/// FOR i:= 0 to 0 |
| 251 | +/// P[0] := __B.xmm[i].dword[0] |
| 252 | +/// P[1] := __B.xmm[i].dword[1] |
| 253 | +/// P[2] := __B.xmm[i].dword[2] |
| 254 | +/// P[3] := __B.xmm[i].dword[3] |
| 255 | +/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) |
| 256 | +/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) |
| 257 | +/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) |
| 258 | +/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) |
| 259 | +/// DEST.xmm[i].dword[0] := C[0] |
| 260 | +/// DEST.xmm[i].dword[1] := C[1] |
| 261 | +/// DEST.xmm[i].dword[2] := C[2] |
| 262 | +/// DEST.xmm[i].dword[3] := C[3] |
| 263 | +/// ENDFOR |
| 264 | +/// DEST[MAX:256] := 0 |
| 265 | +/// \endcode |
| 266 | +#define _mm256_sm4rnds4_epi32(A, B) \ |
| 267 | + (__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B) |
| 268 | + |
| 269 | +#endif // __SM4INTRIN_H |
0 commit comments