Skip to content

Commit 16d848d

Browse files
committed
Add 256-bit vectors and some SSE intrinsics.
1 parent bcb398b commit 16d848d

File tree

9 files changed

+687
-69
lines changed

9 files changed

+687
-69
lines changed

TODO.md

Lines changed: 268 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,167 @@
1+
**TIP**: Use the following command to generate a section in this list for
2+
Intel intrinsics. Replace `SSE4.2` with the name of the instruction set extension you want to list (e.g. `SSE4.1`).
3+
4+
```
5+
rg '^<intrinsic' intel-intrinsics-3.3.15.xml | rg "'SSE4.2'" | rg '^.*name=\x27([^\x27]+)\x27.*$' -r '* [ ] `$1`' >> TODO.md
6+
```
7+
8+
9+
sse
10+
---
11+
* [ ] `_MM_TRANSPOSE4_PS`
12+
* [ ] `_mm_getcsr`
13+
* [ ] `_mm_setcsr`
14+
* [ ] `_MM_GET_EXCEPTION_STATE`
15+
* [ ] `_MM_SET_EXCEPTION_STATE`
16+
* [ ] `_MM_GET_EXCEPTION_MASK`
17+
* [ ] `_MM_SET_EXCEPTION_MASK`
18+
* [ ] `_MM_GET_ROUNDING_MODE`
19+
* [ ] `_MM_SET_ROUNDING_MODE`
20+
* [ ] `_MM_GET_FLUSH_ZERO_MODE`
21+
* [ ] `_MM_SET_FLUSH_ZERO_MODE`
22+
* [ ] `_mm_prefetch`
23+
* [ ] `_mm_sfence`
24+
* [ ] `_mm_max_pi16`
25+
* [ ] `_m_pmaxsw`
26+
* [ ] `_mm_max_pu8`
27+
* [ ] `_m_pmaxub`
28+
* [ ] `_mm_min_pi16`
29+
* [ ] `_m_pminsw`
30+
* [ ] `_mm_min_pu8`
31+
* [ ] `_m_pminub`
32+
* [ ] `_mm_mulhi_pu16`
33+
* [ ] `_m_pmulhuw`
34+
* [ ] `_mm_avg_pu8`
35+
* [ ] `_m_pavgb`
36+
* [ ] `_mm_avg_pu16`
37+
* [ ] `_m_pavgw`
38+
* [ ] `_mm_sad_pu8`
39+
* [ ] `_m_psadbw`
40+
* [ ] `_mm_cvtsi32_ss`
41+
* [ ] `_mm_cvt_si2ss`
42+
* [ ] `_mm_cvtsi64_ss`
43+
* [ ] `_mm_cvtpi32_ps`
44+
* [ ] `_mm_cvt_pi2ps`
45+
* [ ] `_mm_cvtpi16_ps`
46+
* [ ] `_mm_cvtpu16_ps`
47+
* [ ] `_mm_cvtpi8_ps`
48+
* [ ] `_mm_cvtpu8_ps`
49+
* [ ] `_mm_cvtpi32x2_ps`
50+
* [ ] `_mm_stream_pi`
51+
* [ ] `_mm_maskmove_si64`
52+
* [ ] `_m_maskmovq`
53+
* [ ] `_mm_extract_pi16`
54+
* [ ] `_m_pextrw`
55+
* [ ] `_mm_insert_pi16`
56+
* [ ] `_m_pinsrw`
57+
* [ ] `_mm_movemask_pi8`
58+
* [ ] `_m_pmovmskb`
59+
* [ ] `_mm_shuffle_pi16`
60+
* [ ] `_m_pshufw`
61+
* [ ] `_mm_add_ss`
62+
* [ ] `_mm_add_ps`
63+
* [ ] `_mm_sub_ss`
64+
* [ ] `_mm_sub_ps`
65+
* [ ] `_mm_mul_ss`
66+
* [ ] `_mm_mul_ps`
67+
* [ ] `_mm_div_ss`
68+
* [ ] `_mm_div_ps`
69+
* [ ] `_mm_sqrt_ss`
70+
* [x] `_mm_sqrt_ps`
71+
* [ ] `_mm_rcp_ss`
72+
* [x] `_mm_rcp_ps`
73+
* [ ] `_mm_rsqrt_ss`
74+
* [x] `_mm_rsqrt_ps`
75+
* [ ] `_mm_min_ss`
76+
* [x] `_mm_min_ps`
77+
* [ ] `_mm_max_ss`
78+
* [x] `_mm_max_ps`
79+
* [ ] `_mm_and_ps`
80+
* [ ] `_mm_andnot_ps`
81+
* [ ] `_mm_or_ps`
82+
* [ ] `_mm_xor_ps`
83+
* [ ] `_mm_cmpeq_ss`
84+
* [ ] `_mm_cmpeq_ps`
85+
* [ ] `_mm_cmplt_ss`
86+
* [ ] `_mm_cmplt_ps`
87+
* [ ] `_mm_cmple_ss`
88+
* [ ] `_mm_cmple_ps`
89+
* [ ] `_mm_cmpgt_ss`
90+
* [ ] `_mm_cmpgt_ps`
91+
* [ ] `_mm_cmpge_ss`
92+
* [ ] `_mm_cmpge_ps`
93+
* [ ] `_mm_cmpneq_ss`
94+
* [ ] `_mm_cmpneq_ps`
95+
* [ ] `_mm_cmpnlt_ss`
96+
* [ ] `_mm_cmpnlt_ps`
97+
* [ ] `_mm_cmpnle_ss`
98+
* [ ] `_mm_cmpnle_ps`
99+
* [ ] `_mm_cmpngt_ss`
100+
* [ ] `_mm_cmpngt_ps`
101+
* [ ] `_mm_cmpnge_ss`
102+
* [ ] `_mm_cmpnge_ps`
103+
* [ ] `_mm_cmpord_ss`
104+
* [ ] `_mm_cmpord_ps`
105+
* [ ] `_mm_cmpunord_ss`
106+
* [ ] `_mm_cmpunord_ps`
107+
* [ ] `_mm_comieq_ss`
108+
* [ ] `_mm_comilt_ss`
109+
* [ ] `_mm_comile_ss`
110+
* [ ] `_mm_comigt_ss`
111+
* [ ] `_mm_comige_ss`
112+
* [ ] `_mm_comineq_ss`
113+
* [ ] `_mm_ucomieq_ss`
114+
* [ ] `_mm_ucomilt_ss`
115+
* [ ] `_mm_ucomile_ss`
116+
* [ ] `_mm_ucomigt_ss`
117+
* [ ] `_mm_ucomige_ss`
118+
* [ ] `_mm_ucomineq_ss`
119+
* [ ] `_mm_cvtss_si32`
120+
* [ ] `_mm_cvt_ss2si`
121+
* [ ] `_mm_cvtss_si64`
122+
* [ ] `_mm_cvtss_f32`
123+
* [ ] `_mm_cvtps_pi32`
124+
* [ ] `_mm_cvt_ps2pi`
125+
* [ ] `_mm_cvttss_si32`
126+
* [ ] `_mm_cvtt_ss2si`
127+
* [ ] `_mm_cvttss_si64`
128+
* [ ] `_mm_cvttps_pi32`
129+
* [ ] `_mm_cvtt_ps2pi`
130+
* [ ] `_mm_cvtps_pi16`
131+
* [ ] `_mm_cvtps_pi8`
132+
* [ ] `_mm_set_ss`
133+
* [ ] `_mm_set1_ps`
134+
* [ ] `_mm_set_ps1`
135+
* [ ] `_mm_set_ps`
136+
* [ ] `_mm_setr_ps`
137+
* [ ] `_mm_setzero_ps`
138+
* [ ] `_mm_loadh_pi`
139+
* [ ] `_mm_loadl_pi`
140+
* [ ] `_mm_load_ss`
141+
* [ ] `_mm_load1_ps`
142+
* [ ] `_mm_load_ps1`
143+
* [ ] `_mm_load_ps`
144+
* [ ] `_mm_loadu_ps`
145+
* [ ] `_mm_loadr_ps`
146+
* [ ] `_mm_stream_ps`
147+
* [ ] `_mm_storeh_pi`
148+
* [ ] `_mm_storel_pi`
149+
* [ ] `_mm_store_ss`
150+
* [ ] `_mm_store1_ps`
151+
* [ ] `_mm_store_ps1`
152+
* [ ] `_mm_store_ps`
153+
* [ ] `_mm_storeu_ps`
154+
* [ ] `_mm_storer_ps`
155+
* [ ] `_mm_move_ss`
156+
* [ ] `_mm_shuffle_ps`
157+
* [ ] `_mm_unpackhi_ps`
158+
* [ ] `_mm_unpacklo_ps`
159+
* [ ] `_mm_movehl_ps`
160+
* [ ] `_mm_movelh_ps`
161+
* [x] `_mm_movemask_ps`
162+
* [ ] `_mm_undefined_ps`
163+
164+
1165
sse2
2166
----
3167
* [x] `_mm_pause`
@@ -221,7 +385,7 @@ sse2
221385
* [ ] `_mm_storel_pd`
222386
* [ ] `_mm_unpackhi_pd`
223387
* [ ] `_mm_unpacklo_pd`
224-
* [ ] `_mm_movemask_pd`
388+
* [x] `_mm_movemask_pd`
225389
* [ ] `_mm_shuffle_pd`
226390
* [ ] `_mm_move_sd`
227391
* [ ] `_mm_castpd_ps`
@@ -234,6 +398,21 @@ sse2
234398
* [ ] `_mm_undefined_si128`
235399

236400

401+
sse3
402+
----
403+
* [ ] `_mm_addsub_ps`
404+
* [ ] `_mm_addsub_pd`
405+
* [ ] `_mm_hadd_pd`
406+
* [ ] `_mm_hadd_ps`
407+
* [ ] `_mm_hsub_pd`
408+
* [ ] `_mm_hsub_ps`
409+
* [ ] `_mm_lddqu_si128`
410+
* [ ] `_mm_movedup_pd`
411+
* [ ] `_mm_loaddup_pd`
412+
* [ ] `_mm_movehdup_ps`
413+
* [ ] `_mm_moveldup_ps`
414+
415+
237416
ssse3
238417
-----
239418
* [ ] `_mm_abs_pi8`
@@ -268,3 +447,91 @@ ssse3
268447
* [ ] `_mm_sign_pi8`
269448
* [ ] `_mm_sign_pi16`
270449
* [ ] `_mm_sign_pi32`
450+
451+
452+
sse4.1
453+
------
454+
* [ ] `_mm_blend_pd`
455+
* [ ] `_mm_blend_ps`
456+
* [ ] `_mm_blendv_pd`
457+
* [ ] `_mm_blendv_ps`
458+
* [ ] `_mm_blendv_epi8`
459+
* [ ] `_mm_blend_epi16`
460+
* [ ] `_mm_dp_pd`
461+
* [ ] `_mm_dp_ps`
462+
* [ ] `_mm_extract_ps`
463+
* [ ] `_mm_extract_epi8`
464+
* [ ] `_mm_extract_epi32`
465+
* [ ] `_mm_extract_epi64`
466+
* [ ] `_mm_insert_ps`
467+
* [ ] `_mm_insert_epi8`
468+
* [ ] `_mm_insert_epi32`
469+
* [ ] `_mm_insert_epi64`
470+
* [ ] `_mm_max_epi8`
471+
* [ ] `_mm_max_epi32`
472+
* [ ] `_mm_max_epu32`
473+
* [ ] `_mm_max_epu16`
474+
* [ ] `_mm_min_epi8`
475+
* [ ] `_mm_min_epi32`
476+
* [ ] `_mm_min_epu32`
477+
* [ ] `_mm_min_epu16`
478+
* [ ] `_mm_packus_epi32`
479+
* [ ] `_mm_cmpeq_epi64`
480+
* [ ] `_mm_cvtepi8_epi16`
481+
* [ ] `_mm_cvtepi8_epi32`
482+
* [ ] `_mm_cvtepi8_epi64`
483+
* [ ] `_mm_cvtepi16_epi32`
484+
* [ ] `_mm_cvtepi16_epi64`
485+
* [ ] `_mm_cvtepi32_epi64`
486+
* [ ] `_mm_cvtepu8_epi16`
487+
* [ ] `_mm_cvtepu8_epi32`
488+
* [ ] `_mm_cvtepu8_epi64`
489+
* [ ] `_mm_cvtepu16_epi32`
490+
* [ ] `_mm_cvtepu16_epi64`
491+
* [ ] `_mm_cvtepu32_epi64`
492+
* [ ] `_mm_mul_epi32`
493+
* [ ] `_mm_mullo_epi32`
494+
* [ ] `_mm_testz_si128`
495+
* [ ] `_mm_testc_si128`
496+
* [ ] `_mm_testnzc_si128`
497+
* [ ] `_mm_test_all_zeros`
498+
* [ ] `_mm_test_mix_ones_zeros`
499+
* [ ] `_mm_test_all_ones`
500+
* [ ] `_mm_round_pd`
501+
* [ ] `_mm_floor_pd`
502+
* [ ] `_mm_ceil_pd`
503+
* [ ] `_mm_round_ps`
504+
* [ ] `_mm_floor_ps`
505+
* [ ] `_mm_ceil_ps`
506+
* [ ] `_mm_round_sd`
507+
* [ ] `_mm_floor_sd`
508+
* [ ] `_mm_ceil_sd`
509+
* [ ] `_mm_round_ss`
510+
* [ ] `_mm_floor_ss`
511+
* [ ] `_mm_ceil_ss`
512+
* [ ] `_mm_minpos_epu16`
513+
* [ ] `_mm_mpsadbw_epu8`
514+
* [ ] `_mm_stream_load_si128`
515+
516+
517+
sse4.2
518+
------
519+
* [ ] `_mm_cmpistrm`
520+
* [ ] `_mm_cmpistri`
521+
* [ ] `_mm_cmpistrz`
522+
* [ ] `_mm_cmpistrc`
523+
* [ ] `_mm_cmpistrs`
524+
* [ ] `_mm_cmpistro`
525+
* [ ] `_mm_cmpistra`
526+
* [ ] `_mm_cmpestrm`
527+
* [ ] `_mm_cmpestri`
528+
* [ ] `_mm_cmpestrz`
529+
* [ ] `_mm_cmpestrc`
530+
* [ ] `_mm_cmpestrs`
531+
* [ ] `_mm_cmpestro`
532+
* [ ] `_mm_cmpestra`
533+
* [ ] `_mm_cmpgt_epi64`
534+
* [ ] `_mm_crc32_u8`
535+
* [ ] `_mm_crc32_u16`
536+
* [ ] `_mm_crc32_u32`
537+
* [ ] `_mm_crc32_u64`

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
)]
66

77
pub use v128::*;
8+
pub use v256::*;
89
pub use v64::*;
910
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1011
pub use x86::*;
@@ -13,6 +14,7 @@ pub use x86::*;
1314
mod macros;
1415
mod simd;
1516
mod v128;
17+
mod v256;
1618
mod v64;
1719
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1820
mod x86;

0 commit comments

Comments
 (0)