Skip to content

Commit 230c57b

Browse files
committed
[X86] Synchronize the encodekey builtins with gcc. Don't assume void* is 16 byte aligned.
We were taking multiple pointer arguments in the builtin, whereas gcc accepts a single void*. The cast from void* to __m128i* caused IR generation to assume the pointer was 16-byte aligned. Instead, make the builtin take a single void*, emit i8* GEPs to compute each output offset, cast the result to <2 x i64>*, and perform each store with an alignment of 1.
1 parent 28595cb commit 230c57b

File tree

5 files changed

+194
-37
lines changed

5 files changed

+194
-37
lines changed

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,10 +1902,10 @@ TARGET_BUILTIN(__builtin_ia32_enqcmds, "Ucv*vC*", "n", "enqcmd")
19021902

19031903
// KEY LOCKER
19041904
TARGET_BUILTIN(__builtin_ia32_loadiwkey, "vV2OiV2OiV2OiUi", "nV:128:", "kl")
1905-
TARGET_BUILTIN(__builtin_ia32_encodekey128,
1906-
"UiUiV2OiV2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*", "nV:128:", "kl")
1907-
TARGET_BUILTIN(__builtin_ia32_encodekey256,
1908-
"UiUiV2OiV2OiV2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*", "nV:128:", "kl")
1905+
TARGET_BUILTIN(__builtin_ia32_encodekey128_u32,
1906+
"UiUiV2Oiv*", "nV:128:", "kl")
1907+
TARGET_BUILTIN(__builtin_ia32_encodekey256_u32,
1908+
"UiUiV2OiV2Oiv*", "nV:128:", "kl")
19091909
TARGET_BUILTIN(__builtin_ia32_aesenc128kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")
19101910
TARGET_BUILTIN(__builtin_ia32_aesenc256kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")
19111911
TARGET_BUILTIN(__builtin_ia32_aesdec128kl, "UcV2Oi*V2OivC*", "nV:128:", "kl")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14039,8 +14039,37 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1403914039
case X86::BI__builtin_ia32_psubusb128:
1404014040
case X86::BI__builtin_ia32_psubusw128:
1404114041
return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::usub_sat);
14042-
case X86::BI__builtin_ia32_encodekey128:
14043-
case X86::BI__builtin_ia32_encodekey256:
14042+
case X86::BI__builtin_ia32_encodekey128_u32: {
14043+
Intrinsic::ID IID = Intrinsic::x86_encodekey128;
14044+
14045+
Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});
14046+
14047+
for (int i = 0; i < 6; ++i) {
14048+
Value *Extract = Builder.CreateExtractValue(Call, i + 1);
14049+
Value *Ptr = Builder.CreateConstGEP1_32(Ops[2], i * 16);
14050+
Ptr = Builder.CreateBitCast(
14051+
Ptr, llvm::PointerType::getUnqual(Extract->getType()));
14052+
Builder.CreateAlignedStore(Extract, Ptr, Align(1));
14053+
}
14054+
14055+
return Builder.CreateExtractValue(Call, 0);
14056+
}
14057+
case X86::BI__builtin_ia32_encodekey256_u32: {
14058+
Intrinsic::ID IID = Intrinsic::x86_encodekey256;
14059+
14060+
Value *Call =
14061+
Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});
14062+
14063+
for (int i = 0; i < 7; ++i) {
14064+
Value *Extract = Builder.CreateExtractValue(Call, i + 1);
14065+
Value *Ptr = Builder.CreateConstGEP1_32(Ops[3], i * 16);
14066+
Ptr = Builder.CreateBitCast(
14067+
Ptr, llvm::PointerType::getUnqual(Extract->getType()));
14068+
Builder.CreateAlignedStore(Extract, Ptr, Align(1));
14069+
}
14070+
14071+
return Builder.CreateExtractValue(Call, 0);
14072+
}
1404414073
case X86::BI__builtin_ia32_aesenc128kl:
1404514074
case X86::BI__builtin_ia32_aesdec128kl:
1404614075
case X86::BI__builtin_ia32_aesenc256kl:
@@ -14056,18 +14085,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1405614085

1405714086
switch (BuiltinID) {
1405814087
default: llvm_unreachable("Unsupported intrinsic!");
14059-
case X86::BI__builtin_ia32_encodekey128:
14060-
ID = Intrinsic::x86_encodekey128;
14061-
InOps = {Ops[0], Ops[1]};
14062-
FirstReturnOp = 2;
14063-
ResultCount = 6;
14064-
break;
14065-
case X86::BI__builtin_ia32_encodekey256:
14066-
ID = Intrinsic::x86_encodekey256;
14067-
InOps = {Ops[0], Ops[1], Ops[2]};
14068-
FirstReturnOp = 3;
14069-
ResultCount = 7;
14070-
break;
1407114088
case X86::BI__builtin_ia32_aesenc128kl:
1407214089
case X86::BI__builtin_ia32_aesdec128kl:
1407314090
case X86::BI__builtin_ia32_aesenc256kl:

clang/lib/Headers/keylockerintrin.h

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -132,15 +132,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
132132
/// \endoperation
133133
static __inline__ unsigned int __DEFAULT_FN_ATTRS
134134
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
135-
__m128i *__results = (__m128i*)__h;
136-
137-
return __builtin_ia32_encodekey128(__htype, __key,
138-
__results,
139-
__results + 1,
140-
__results + 2,
141-
__results + 3,
142-
__results + 4,
143-
__results + 5);
135+
return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
144136
}
145137

146138
/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
@@ -181,16 +173,8 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
181173
static __inline__ unsigned int __DEFAULT_FN_ATTRS
182174
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
183175
void *__h) {
184-
__m128i *__results = (__m128i*)__h;
185-
186-
return __builtin_ia32_encodekey256(__htype, __key_lo, __key_hi,
187-
__results,
188-
__results + 1,
189-
__results + 2,
190-
__results + 3,
191-
__results + 4,
192-
__results + 5,
193-
__results + 6);
176+
return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
177+
(__v2di)__key_hi, __h);
194178
}
195179

196180
/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using

clang/test/CodeGen/X86/keylocker.c

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,64 @@ void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i
1414
unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
1515
//CHECK-LABEL: @test_encodekey128_u32
1616
//CHECK: call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %{{.*}}, <2 x i64> %{{.*}})
17+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
18+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
19+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
20+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
21+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 16
22+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
23+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
24+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
25+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 32
26+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
27+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
28+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
29+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 48
30+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
31+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
32+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
33+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 64
34+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
35+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
36+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
37+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 80
38+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
39+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
40+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
1741
return _mm_encodekey128_u32(htype, key, h);
1842
}
1943

2044
unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i key_hi, void *h) {
2145
//CHECK-LABEL: @test_encodekey256_u32
2246
//CHECK: call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
47+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 1
48+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
49+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
50+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 2
51+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 16
52+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
53+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
54+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 3
55+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 32
56+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
57+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
58+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 4
59+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 48
60+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
61+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
62+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 5
63+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 64
64+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
65+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
66+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 6
67+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 80
68+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
69+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
70+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 7
71+
//CHECK: getelementptr i8, i8* %{{.*}}, i32 96
72+
//CHECK: bitcast i8* %{{.*}} to <2 x i64>*
73+
//CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
74+
//CHECK: extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %{{.*}}, 0
2375
return _mm_encodekey256_u32(htype, key_lo, key_hi, h);
2476
}
2577

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+kl,+widekl | FileCheck %s
3+
4+
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/X86/keylocker.c
5+
6+
define void @test_loadiwkey(i32 %ctl, <2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi) {
7+
; CHECK-LABEL: test_loadiwkey:
8+
; CHECK: # %bb.0: # %entry
9+
; CHECK-NEXT: movl %edi, %eax
10+
; CHECK-NEXT: loadiwkey %xmm2, %xmm1
11+
; CHECK-NEXT: retq
12+
entry:
13+
tail call void @llvm.x86.loadiwkey(<2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi, i32 %ctl)
14+
ret void
15+
}
16+
17+
define i32 @test_encodekey128_u32(i32 %htype, <2 x i64> %key, i8* nocapture %h) {
18+
; CHECK-LABEL: test_encodekey128_u32:
19+
; CHECK: # %bb.0: # %entry
20+
; CHECK-NEXT: encodekey128 %edi, %eax
21+
; CHECK-NEXT: movups %xmm0, (%rsi)
22+
; CHECK-NEXT: movups %xmm1, 16(%rsi)
23+
; CHECK-NEXT: movups %xmm2, 32(%rsi)
24+
; CHECK-NEXT: movups %xmm4, 48(%rsi)
25+
; CHECK-NEXT: movups %xmm5, 64(%rsi)
26+
; CHECK-NEXT: movups %xmm6, 80(%rsi)
27+
; CHECK-NEXT: retq
28+
entry:
29+
%0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %htype, <2 x i64> %key)
30+
%1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
31+
%2 = bitcast i8* %h to <2 x i64>*
32+
store <2 x i64> %1, <2 x i64>* %2, align 1
33+
%3 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
34+
%4 = getelementptr i8, i8* %h, i64 16
35+
%5 = bitcast i8* %4 to <2 x i64>*
36+
store <2 x i64> %3, <2 x i64>* %5, align 1
37+
%6 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
38+
%7 = getelementptr i8, i8* %h, i64 32
39+
%8 = bitcast i8* %7 to <2 x i64>*
40+
store <2 x i64> %6, <2 x i64>* %8, align 1
41+
%9 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
42+
%10 = getelementptr i8, i8* %h, i64 48
43+
%11 = bitcast i8* %10 to <2 x i64>*
44+
store <2 x i64> %9, <2 x i64>* %11, align 1
45+
%12 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
46+
%13 = getelementptr i8, i8* %h, i64 64
47+
%14 = bitcast i8* %13 to <2 x i64>*
48+
store <2 x i64> %12, <2 x i64>* %14, align 1
49+
%15 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
50+
%16 = getelementptr i8, i8* %h, i64 80
51+
%17 = bitcast i8* %16 to <2 x i64>*
52+
store <2 x i64> %15, <2 x i64>* %17, align 1
53+
%18 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
54+
ret i32 %18
55+
}
56+
57+
define i32 @test_encodekey256_u32(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi, i8* nocapture %h) {
58+
; CHECK-LABEL: test_encodekey256_u32:
59+
; CHECK: # %bb.0: # %entry
60+
; CHECK-NEXT: encodekey256 %edi, %eax
61+
; CHECK-NEXT: movups %xmm0, (%rsi)
62+
; CHECK-NEXT: movups %xmm1, 16(%rsi)
63+
; CHECK-NEXT: movups %xmm2, 32(%rsi)
64+
; CHECK-NEXT: movups %xmm3, 48(%rsi)
65+
; CHECK-NEXT: movups %xmm4, 64(%rsi)
66+
; CHECK-NEXT: movups %xmm5, 80(%rsi)
67+
; CHECK-NEXT: movups %xmm6, 96(%rsi)
68+
; CHECK-NEXT: retq
69+
entry:
70+
%0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi)
71+
%1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
72+
%2 = bitcast i8* %h to <2 x i64>*
73+
store <2 x i64> %1, <2 x i64>* %2, align 1
74+
%3 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
75+
%4 = getelementptr i8, i8* %h, i64 16
76+
%5 = bitcast i8* %4 to <2 x i64>*
77+
store <2 x i64> %3, <2 x i64>* %5, align 1
78+
%6 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
79+
%7 = getelementptr i8, i8* %h, i64 32
80+
%8 = bitcast i8* %7 to <2 x i64>*
81+
store <2 x i64> %6, <2 x i64>* %8, align 1
82+
%9 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
83+
%10 = getelementptr i8, i8* %h, i64 48
84+
%11 = bitcast i8* %10 to <2 x i64>*
85+
store <2 x i64> %9, <2 x i64>* %11, align 1
86+
%12 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
87+
%13 = getelementptr i8, i8* %h, i64 64
88+
%14 = bitcast i8* %13 to <2 x i64>*
89+
store <2 x i64> %12, <2 x i64>* %14, align 1
90+
%15 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
91+
%16 = getelementptr i8, i8* %h, i64 80
92+
%17 = bitcast i8* %16 to <2 x i64>*
93+
store <2 x i64> %15, <2 x i64>* %17, align 1
94+
%18 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
95+
%19 = getelementptr i8, i8* %h, i64 96
96+
%20 = bitcast i8* %19 to <2 x i64>*
97+
store <2 x i64> %18, <2 x i64>* %20, align 1
98+
%21 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
99+
ret i32 %21
100+
}
101+
102+
declare void @llvm.x86.loadiwkey(<2 x i64>, <2 x i64>, <2 x i64>, i32)
103+
declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32, <2 x i64>)
104+
declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32, <2 x i64>, <2 x i64>)

0 commit comments

Comments
 (0)