Skip to content

Commit d9794fc

Browse files
p32blo authored and gnzlbg committed
Add _mm_cvtepu8_epi{16, 32, 64}
1 parent 9e9185a commit d9794fc

File tree

3 files changed

+63
-2
lines changed

3 files changed

+63
-2
lines changed

src/lib.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,11 @@ mod v32 {
176176
define_ty! { u8x4, u8, u8, u8, u8 }
177177
define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 }
178178

179-
define_casts!((i8x4, i32x4, as_i32x4), (i16x2, i64x2, as_i64x2));
179+
define_casts!(
180+
(i8x4, i32x4, as_i32x4),
181+
(u8x4, i32x4, as_i32x4),
182+
(i16x2, i64x2, as_i64x2)
183+
);
180184
}
181185

182186
/// 16-bit wide vector types
@@ -186,7 +190,13 @@ mod v16 {
186190
define_ty! { i8x2, i8, i8 }
187191
define_impl! { i8x2, i8, 2, i8x2, x0, x1 }
188192

189-
define_casts!((i8x2, i64x2, as_i64x2));
193+
define_ty! { u8x2, u8, u8 }
194+
define_impl! { u8x2, u8, 2, i8x2, x0, x1 }
195+
196+
define_casts!(
197+
(i8x2, i64x2, as_i64x2),
198+
(u8x2, i64x2, as_i64x2)
199+
);
190200
}
191201

192202
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]

src/v64.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ define_casts!(
6060
(u8x8, i8x8, as_i8x8),
6161
(i8x8, u8x8, as_u8x8),
6262
(i8x8, i16x8, as_i16x8),
63+
(u8x8, i16x8, as_i16x8),
6364
(i16x4, i32x4, as_i32x4),
6465
(i32x2, i64x2, as_i64x2),
6566
(u8x8, u16x8, as_u16x8),

src/x86/sse41.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,30 @@ pub unsafe fn _mm_cvtepi32_epi64(a: i32x4) -> i64x2 {
346346
simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]).as_i64x2()
347347
}
348348

349+
/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
350+
#[inline(always)]
351+
#[target_feature = "+sse4.1"]
352+
#[cfg_attr(test, assert_instr(pmovzxbw))]
353+
pub unsafe fn _mm_cvtepu8_epi16(a: u8x16) -> i16x8 {
354+
simd_shuffle8::<_, ::v64::u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]).as_i16x8()
355+
}
356+
357+
/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
358+
#[inline(always)]
359+
#[target_feature = "+sse4.1"]
360+
#[cfg_attr(test, assert_instr(pmovzxbd))]
361+
pub unsafe fn _mm_cvtepu8_epi32(a: u8x16) -> i32x4 {
362+
simd_shuffle4::<_, ::v32::u8x4>(a, a, [0, 1, 2, 3]).as_i32x4()
363+
}
364+
365+
/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
366+
#[inline(always)]
367+
#[target_feature = "+sse4.1"]
368+
#[cfg_attr(test, assert_instr(pmovzxbq))]
369+
pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 {
370+
simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]).as_i64x2()
371+
}
372+
349373
/// Returns the dot product of two f64x2 vectors.
350374
///
351375
/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -1041,6 +1065,32 @@ mod tests {
10411065
assert_eq!(r, e);
10421066
}
10431067

1068+
#[simd_test = "sse4.1"]
1069+
unsafe fn _mm_cvtepu8_epi16() {
1070+
let a = u8x16::splat(10);
1071+
let r = sse41::_mm_cvtepu8_epi16(a);
1072+
let e = i16x8::splat(10);
1073+
assert_eq!(r, e);
1074+
}
1075+
1076+
#[simd_test = "sse4.1"]
1077+
unsafe fn _mm_cvtepu8_epi32() {
1078+
let a = u8x16::splat(10);
1079+
let r = sse41::_mm_cvtepu8_epi32(a);
1080+
let e = i32x4::splat(10);
1081+
assert_eq!(r, e);
1082+
}
1083+
1084+
#[simd_test = "sse4.1"]
1085+
unsafe fn _mm_cvtepu8_epi64() {
1086+
let a = u8x16::splat(10);
1087+
let r = sse41::_mm_cvtepu8_epi64(a);
1088+
let e = i64x2::splat(10);
1089+
assert_eq!(r, e);
1090+
}
1091+
1092+
1093+
10441094
#[simd_test = "sse4.1"]
10451095
unsafe fn _mm_dp_pd() {
10461096
let a = f64x2::new(2.0, 3.0);

0 commit comments

Comments
 (0)