Skip to content

Commit fb7bcd1

Browse files
authored
Merge pull request #3987 from TDecking/vpclmul
Implement LLVM x86 vpclmulqdq intrinsics
2 parents ca0e5df + 05530b6 commit fb7bcd1

File tree

2 files changed

+249
-38
lines changed

2 files changed

+249
-38
lines changed

src/tools/miri/src/shims/x86/mod.rs

Lines changed: 56 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,22 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
9595
}
9696
}
9797

98-
"pclmulqdq" => {
98+
"pclmulqdq" | "pclmulqdq.256" | "pclmulqdq.512" => {
99+
let mut len = 2; // in units of 64bits
100+
this.expect_target_feature_for_intrinsic(link_name, "pclmulqdq")?;
101+
if unprefixed_name.ends_with(".256") {
102+
this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?;
103+
len = 4;
104+
} else if unprefixed_name.ends_with(".512") {
105+
this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?;
106+
this.expect_target_feature_for_intrinsic(link_name, "avx512f")?;
107+
len = 8;
108+
}
109+
99110
let [left, right, imm] =
100111
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
101112

102-
pclmulqdq(this, left, right, imm, dest)?;
113+
pclmulqdq(this, left, right, imm, dest, len)?;
103114
}
104115

105116
name if name.starts_with("bmi.") => {
@@ -1134,61 +1145,68 @@ fn pmulhrsw<'tcx>(
11341145
/// Perform a carry-less multiplication of two 64-bit integers, selected from `left` and `right` according to `imm8`,
11351146
/// and store the results in `dst`.
11361147
///
1137-
/// `left` and `right` are both vectors of type 2 x i64. Only bits 0 and 4 of `imm8` matter;
1148+
/// `left` and `right` are both vectors of type `len` x i64. Only bits 0 and 4 of `imm8` matter;
11381149
/// they select the element of `left` and `right`, respectively.
11391150
///
1151+
/// `len` is the SIMD vector length (in counts of `i64` values). It is expected to be one of
1152+
/// `2`, `4`, or `8`.
1153+
///
11401154
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128>
11411155
fn pclmulqdq<'tcx>(
11421156
this: &mut MiriInterpCx<'tcx>,
11431157
left: &OpTy<'tcx>,
11441158
right: &OpTy<'tcx>,
11451159
imm8: &OpTy<'tcx>,
11461160
dest: &MPlaceTy<'tcx>,
1161+
len: u64,
11471162
) -> InterpResult<'tcx, ()> {
11481163
assert_eq!(left.layout, right.layout);
11491164
assert_eq!(left.layout.size, dest.layout.size);
1165+
assert!([2u64, 4, 8].contains(&len));
11501166

1151-
// Transmute to `[u64; 2]`
1167+
// Transmute the input into arrays of `[u64; len]`.
1168+
// Transmute the output into an array of `[u128, len / 2]`.
11521169

1153-
let array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
1154-
let left = left.transmute(array_layout, this)?;
1155-
let right = right.transmute(array_layout, this)?;
1156-
let dest = dest.transmute(array_layout, this)?;
1170+
let src_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, len))?;
1171+
let dest_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u128, len / 2))?;
1172+
1173+
let left = left.transmute(src_layout, this)?;
1174+
let right = right.transmute(src_layout, this)?;
1175+
let dest = dest.transmute(dest_layout, this)?;
11571176

11581177
let imm8 = this.read_scalar(imm8)?.to_u8()?;
11591178

1160-
// select the 64-bit integer from left that the user specified (low or high)
1161-
let index = if (imm8 & 0x01) == 0 { 0 } else { 1 };
1162-
let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?;
1163-
1164-
// select the 64-bit integer from right that the user specified (low or high)
1165-
let index = if (imm8 & 0x10) == 0 { 0 } else { 1 };
1166-
let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?;
1167-
1168-
// Perform carry-less multiplication
1169-
//
1170-
// This operation is like long multiplication, but ignores all carries.
1171-
// That idea corresponds to the xor operator, which is used in the implementation.
1172-
//
1173-
// Uncyclopedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example
1174-
let mut result: u128 = 0;
1175-
1176-
for i in 0..64 {
1177-
// if the i-th bit in right is set
1178-
if (right & (1 << i)) != 0 {
1179-
// xor result with `left` shifted to the left by i positions
1180-
result ^= u128::from(left) << i;
1179+
for i in 0..(len / 2) {
1180+
let lo = i.strict_mul(2);
1181+
let hi = i.strict_mul(2).strict_add(1);
1182+
1183+
// select the 64-bit integer from left that the user specified (low or high)
1184+
let index = if (imm8 & 0x01) == 0 { lo } else { hi };
1185+
let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?;
1186+
1187+
// select the 64-bit integer from right that the user specified (low or high)
1188+
let index = if (imm8 & 0x10) == 0 { lo } else { hi };
1189+
let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?;
1190+
1191+
// Perform carry-less multiplication.
1192+
//
1193+
// This operation is like long multiplication, but ignores all carries.
1194+
// That idea corresponds to the xor operator, which is used in the implementation.
1195+
//
1196+
// Uncyclopedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example
1197+
let mut result: u128 = 0;
1198+
1199+
for i in 0..64 {
1200+
// if the i-th bit in right is set
1201+
if (right & (1 << i)) != 0 {
1202+
// xor result with `left` shifted to the left by i positions
1203+
result ^= u128::from(left) << i;
1204+
}
11811205
}
1182-
}
1183-
1184-
let result_low = (result & 0xFFFF_FFFF_FFFF_FFFF) as u64;
1185-
let result_high = (result >> 64) as u64;
1186-
1187-
let dest_low = this.project_index(&dest, 0)?;
1188-
this.write_scalar(Scalar::from_u64(result_low), &dest_low)?;
11891206

1190-
let dest_high = this.project_index(&dest, 1)?;
1191-
this.write_scalar(Scalar::from_u64(result_high), &dest_high)?;
1207+
let dest = this.project_index(&dest, i)?;
1208+
this.write_scalar(Scalar::from_u128(result), &dest)?;
1209+
}
11921210

11931211
interp_ok(())
11941212
}
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
// We're testing x86 target specific features
2+
//@revisions: avx512 avx
3+
//@only-target: x86_64 i686
4+
//@[avx512]compile-flags: -C target-feature=+vpclmulqdq,+avx512f
5+
//@[avx]compile-flags: -C target-feature=+vpclmulqdq,+avx2
6+
7+
// The constants in the tests below are just bit patterns. They should not
8+
// be interpreted as integers; signedness does not make sense for them, but
9+
// __mXXXi happens to be defined in terms of signed integers.
10+
#![allow(overflowing_literals)]
11+
#![feature(avx512_target_feature)]
12+
#![feature(stdarch_x86_avx512)]
13+
14+
#[cfg(target_arch = "x86")]
15+
use std::arch::x86::*;
16+
#[cfg(target_arch = "x86_64")]
17+
use std::arch::x86_64::*;
18+
use std::mem::transmute;
19+
20+
fn main() {
21+
// Mostly copied from library/stdarch/crates/core_arch/src/x86/vpclmulqdq.rs
22+
23+
assert!(is_x86_feature_detected!("pclmulqdq"));
24+
assert!(is_x86_feature_detected!("vpclmulqdq"));
25+
26+
unsafe {
27+
test_mm256_clmulepi64_epi128();
28+
29+
if is_x86_feature_detected!("avx512f") {
30+
test_mm512_clmulepi64_epi128();
31+
}
32+
}
33+
}
34+
35+
macro_rules! verify_kat_pclmul {
36+
($broadcast:ident, $clmul:ident, $assert:ident) => {
37+
// Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
38+
let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
39+
let a = $broadcast(a);
40+
let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
41+
let b = $broadcast(b);
42+
let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
43+
let r00 = $broadcast(r00);
44+
let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
45+
let r01 = $broadcast(r01);
46+
let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
47+
let r10 = $broadcast(r10);
48+
let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
49+
let r11 = $broadcast(r11);
50+
51+
$assert($clmul::<0x00>(a, b), r00);
52+
$assert($clmul::<0x10>(a, b), r01);
53+
$assert($clmul::<0x01>(a, b), r10);
54+
$assert($clmul::<0x11>(a, b), r11);
55+
56+
let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
57+
let a0 = $broadcast(a0);
58+
let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
59+
let r = $broadcast(r);
60+
$assert($clmul::<0x00>(a0, a0), r);
61+
}
62+
}
63+
64+
// this function tests one of the possible 4 instances
65+
// with different inputs across lanes for the 512-bit version
66+
#[target_feature(enable = "vpclmulqdq,avx512f")]
67+
unsafe fn verify_512_helper(
68+
linear: unsafe fn(__m128i, __m128i) -> __m128i,
69+
vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
70+
) {
71+
let a = _mm512_set_epi64(
72+
0xDCB4DB3657BF0B7D,
73+
0x18DB0601068EDD9F,
74+
0xB76B908233200DC5,
75+
0xE478235FA8E22D5E,
76+
0xAB05CFFA2621154C,
77+
0x1171B47A186174C9,
78+
0x8C6B6C0E7595CEC9,
79+
0xBE3E7D4934E961BD,
80+
);
81+
let b = _mm512_set_epi64(
82+
0x672F6F105A94CEA7,
83+
0x8298B8FFCA5F829C,
84+
0xA3927047B3FB61D8,
85+
0x978093862CDE7187,
86+
0xB1927AB22F31D0EC,
87+
0xA9A5DA619BE4D7AF,
88+
0xCA2590F56884FDC6,
89+
0x19BE9F660038BDB5,
90+
);
91+
92+
let a_decomp = transmute::<_, [__m128i; 4]>(a);
93+
let b_decomp = transmute::<_, [__m128i; 4]>(b);
94+
95+
let r = vectorized(a, b);
96+
97+
let e_decomp = [
98+
linear(a_decomp[0], b_decomp[0]),
99+
linear(a_decomp[1], b_decomp[1]),
100+
linear(a_decomp[2], b_decomp[2]),
101+
linear(a_decomp[3], b_decomp[3]),
102+
];
103+
let e = transmute::<_, __m512i>(e_decomp);
104+
105+
assert_eq_m512i(r, e)
106+
}
107+
108+
// this function tests one of the possible 4 instances
109+
// with different inputs across lanes for the 256-bit version
110+
#[target_feature(enable = "vpclmulqdq")]
111+
unsafe fn verify_256_helper(
112+
linear: unsafe fn(__m128i, __m128i) -> __m128i,
113+
vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
114+
) {
115+
let a = _mm256_set_epi64x(
116+
0xDCB4DB3657BF0B7D,
117+
0x18DB0601068EDD9F,
118+
0xB76B908233200DC5,
119+
0xE478235FA8E22D5E,
120+
);
121+
let b = _mm256_set_epi64x(
122+
0x672F6F105A94CEA7,
123+
0x8298B8FFCA5F829C,
124+
0xA3927047B3FB61D8,
125+
0x978093862CDE7187,
126+
);
127+
128+
let a_decomp = transmute::<_, [__m128i; 2]>(a);
129+
let b_decomp = transmute::<_, [__m128i; 2]>(b);
130+
131+
let r = vectorized(a, b);
132+
133+
let e_decomp = [linear(a_decomp[0], b_decomp[0]), linear(a_decomp[1], b_decomp[1])];
134+
let e = transmute::<_, __m256i>(e_decomp);
135+
136+
assert_eq_m256i(r, e)
137+
}
138+
139+
#[target_feature(enable = "vpclmulqdq,avx512f")]
140+
unsafe fn test_mm512_clmulepi64_epi128() {
141+
verify_kat_pclmul!(_mm512_broadcast_i32x4, _mm512_clmulepi64_epi128, assert_eq_m512i);
142+
143+
verify_512_helper(
144+
|a, b| _mm_clmulepi64_si128::<0x00>(a, b),
145+
|a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
146+
);
147+
verify_512_helper(
148+
|a, b| _mm_clmulepi64_si128::<0x01>(a, b),
149+
|a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
150+
);
151+
verify_512_helper(
152+
|a, b| _mm_clmulepi64_si128::<0x10>(a, b),
153+
|a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
154+
);
155+
verify_512_helper(
156+
|a, b| _mm_clmulepi64_si128::<0x11>(a, b),
157+
|a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
158+
);
159+
}
160+
161+
#[target_feature(enable = "vpclmulqdq")]
162+
unsafe fn test_mm256_clmulepi64_epi128() {
163+
verify_kat_pclmul!(_mm256_broadcastsi128_si256, _mm256_clmulepi64_epi128, assert_eq_m256i);
164+
165+
verify_256_helper(
166+
|a, b| _mm_clmulepi64_si128::<0x00>(a, b),
167+
|a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
168+
);
169+
verify_256_helper(
170+
|a, b| _mm_clmulepi64_si128::<0x01>(a, b),
171+
|a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
172+
);
173+
verify_256_helper(
174+
|a, b| _mm_clmulepi64_si128::<0x10>(a, b),
175+
|a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
176+
);
177+
verify_256_helper(
178+
|a, b| _mm_clmulepi64_si128::<0x11>(a, b),
179+
|a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
180+
);
181+
}
182+
183+
#[track_caller]
184+
#[target_feature(enable = "avx512f")]
185+
unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
186+
assert_eq!(transmute::<_, [u64; 8]>(a), transmute::<_, [u64; 8]>(b))
187+
}
188+
189+
#[track_caller]
190+
#[target_feature(enable = "avx")]
191+
unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
192+
assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
193+
}

0 commit comments

Comments
 (0)