Skip to content

Commit 311e4e3

Browse files
mahesh-attardemattarde
andauthored
[X86][AVX10.2] Support AVX10.2 MOVZXC new Instructions. (#108537)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Chapter 14 INTEL® AVX10 ZERO-EXTENDING PARTIAL VECTOR COPY INSTRUCTIONS --------- Co-authored-by: mattarde <[email protected]>
1 parent 76347ee commit 311e4e3

File tree

15 files changed

+581
-1
lines changed

15 files changed

+581
-1
lines changed

clang/lib/Headers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ set(x86_files
156156
avx10_2_512satcvtintrin.h
157157
avx10_2bf16intrin.h
158158
avx10_2convertintrin.h
159+
avx10_2copyintrin.h
159160
avx10_2minmaxintrin.h
160161
avx10_2niintrin.h
161162
avx10_2satcvtdsintrin.h

clang/lib/Headers/avx10_2copyintrin.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
#ifndef __IMMINTRIN_H
10+
#error \
11+
"Never use <avx10_2copyintrin.h> directly; include <immintrin.h> instead."
12+
#endif // __IMMINTRIN_H
13+
14+
#ifndef __AVX10_2COPYINTRIN_H
15+
#define __AVX10_2COPYINTRIN_H
16+
17+
/* Define the default attributes for the functions in this file. */
18+
#define __DEFAULT_FN_ATTRS128 \
19+
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
20+
__min_vector_width__(128)))
21+
22+
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) {
23+
return (__m128i)__builtin_shufflevector(
24+
(__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4);
25+
}
26+
27+
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) {
28+
return (__m128i)__builtin_shufflevector(
29+
(__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8);
30+
}
31+
32+
#undef __DEFAULT_FN_ATTRS128
33+
34+
#endif // __AVX10_2COPYINTRIN_H

clang/lib/Headers/immintrin.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,7 @@ _storebe_i64(void * __P, long long __D) {
651651
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
652652
#include <avx10_2bf16intrin.h>
653653
#include <avx10_2convertintrin.h>
654+
#include <avx10_2copyintrin.h>
654655
#include <avx10_2minmaxintrin.h>
655656
#include <avx10_2niintrin.h>
656657
#include <avx10_2satcvtdsintrin.h>
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx10.2-512 \
2+
// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s
3+
4+
#include <immintrin.h>
5+
#include <stddef.h>
6+
7+
__m128i test_mm_move_epi32(__m128i A) {
8+
// CHECK-LABEL: test_mm_move_epi32
9+
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
10+
return _mm_move_epi32(A);
11+
}
12+
13+
__m128i test_mm_move_epi16(__m128i A) {
14+
// CHECK-LABEL: test_mm_move_epi16
15+
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
16+
return _mm_move_epi16(A);
17+
}

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12348,7 +12348,7 @@ static SDValue lowerShuffleAsElementInsertion(
1234812348
}
1234912349
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
1235012350
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12351-
EltVT == MVT::i16) {
12351+
(EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
1235212352
// Either not inserting from the low element of the input or the input
1235312353
// element size is too small to use VZEXT_MOVL to clear the high bits.
1235412354
return SDValue();

llvm/lib/Target/X86/X86InstrAVX10.td

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,3 +1583,67 @@ let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {
15831583
"vucomxss", SSEPackedSingle>,
15841584
TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
15851585
}
1586+
1587+
//-------------------------------------------------
1588+
// AVX10 MOVZXC (COPY) instructions
1589+
//-------------------------------------------------
1590+
let Predicates = [HasAVX10_2] in {
1591+
def VMOVZPDILo2PDIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
1592+
(ins VR128X:$src),
1593+
"vmovd\t{$src, $dst|$dst, $src}",
1594+
[(set VR128X:$dst, (v4i32 (X86vzmovl
1595+
(v4i32 VR128X:$src))))]>, EVEX,
1596+
Sched<[WriteVecMoveFromGpr]>;
1597+
1598+
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
1599+
def VMOVZPDILo2PDIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
1600+
(ins i32mem:$src),
1601+
"vmovd\t{$src, $dst|$dst, $src}", []>, EVEX,
1602+
EVEX_CD8<32, CD8VT1>,
1603+
Sched<[WriteVecLoad]>;
1604+
1605+
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
1606+
def VMOVZPDILo2PDIZmr : AVX512PDI<0xD6, MRMDestMem, (outs),
1607+
(ins i32mem:$dst, VR128X:$src),
1608+
"vmovd\t{$src, $dst|$dst, $src}", []>, EVEX,
1609+
EVEX_CD8<32, CD8VT1>,
1610+
Sched<[WriteVecStore]>;
1611+
1612+
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
1613+
def VMOVZPDILo2PDIZrr2 : AVX512PDI<0xD6, MRMSrcReg, (outs VR128X:$dst),
1614+
(ins VR128X:$src),
1615+
"vmovd\t{$src, $dst|$dst, $src}", []>, EVEX,
1616+
Sched<[WriteVecMoveFromGpr]>;
1617+
def : InstAlias<"vmovd.s\t{$src, $dst|$dst, $src}",
1618+
(VMOVZPDILo2PDIZrr2 VR128X:$dst, VR128X:$src), 0>;
1619+
1620+
def VMOVZPWILo2PWIZrr : AVX512XSI<0x6E, MRMSrcReg, (outs VR128X:$dst),
1621+
(ins VR128X:$src),
1622+
"vmovw\t{$src, $dst|$dst, $src}",
1623+
[(set VR128X:$dst, (v8i16 (X86vzmovl
1624+
(v8i16 VR128X:$src))))]>, EVEX, T_MAP5,
1625+
Sched<[WriteVecMoveFromGpr]>;
1626+
1627+
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
1628+
def VMOVZPWILo2PWIZrm : AVX512XSI<0x6E, MRMSrcMem, (outs VR128X:$dst),
1629+
(ins i16mem:$src),
1630+
"vmovw\t{$src, $dst|$dst, $src}", []>, EVEX,
1631+
EVEX_CD8<16, CD8VT1>, T_MAP5,
1632+
Sched<[WriteVecLoad]>;
1633+
1634+
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
1635+
def VMOVZPWILo2PWIZmr : AVX512XSI<0x7E, MRMDestMem, (outs),
1636+
(ins i32mem:$dst, VR128X:$src),
1637+
"vmovw\t{$src, $dst|$dst, $src}", []>, EVEX,
1638+
EVEX_CD8<16, CD8VT1>, T_MAP5,
1639+
Sched<[WriteVecStore]>;
1640+
1641+
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
1642+
def VMOVZPWILo2PWIZrr2 : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
1643+
(ins VR128X:$src),
1644+
"vmovw\t{$src, $dst|$dst, $src}",
1645+
[]>, EVEX, T_MAP5,
1646+
Sched<[WriteVecMoveFromGpr]>;
1647+
def : InstAlias<"vmovw.s\t{$src, $dst|$dst, $src}",
1648+
(VMOVZPWILo2PWIZrr2 VR128X:$dst, VR128X:$src), 0>;
1649+
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX102
3+
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx512f | FileCheck %s --check-prefixes=NOAVX512MOVZXC
4+
5+
define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
6+
; AVX102-LABEL: test_mm_move_epi32:
7+
; AVX102: # %bb.0:
8+
; AVX102-NEXT: vmovd %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x7e,0xc0]
9+
; AVX102-NEXT: retq # encoding: [0xc3]
10+
;
11+
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
12+
; NOAVX512MOVZXC: # %bb.0:
13+
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
14+
; NOAVX512MOVZXC-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
15+
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
16+
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
17+
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
18+
ret <4 x i32> %res
19+
}
20+
21+
define <8 x i16> @test_mm_move_epi16(<8 x i16> %a0) nounwind {
22+
; AVX102-LABEL: test_mm_move_epi16:
23+
; AVX102: # %bb.0:
24+
; AVX102-NEXT: vmovw %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x6e,0xc0]
25+
; AVX102-NEXT: retq # encoding: [0xc3]
26+
;
27+
; NOAVX512MOVZXC-LABEL: test_mm_move_epi16:
28+
; NOAVX512MOVZXC: # %bb.0:
29+
; NOAVX512MOVZXC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xef,0xc9]
30+
; NOAVX512MOVZXC-NEXT: vpblendw $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0e,0xc0,0x01]
31+
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
32+
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
33+
%res = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
34+
ret <8 x i16> %res
35+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT
2+
# RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
3+
4+
# ATT: vmovd (%ecx), %xmm5
5+
# INTEL: vmovd xmm5, dword ptr [ecx]
6+
0x62 0xf1 0x7e 0x08 0x7e 0x29
7+
8+
# ATT: vmovd %xmm5, (%ecx)
9+
# INTEL: vmovd dword ptr [ecx], xmm5
10+
0x62 0xf1 0x7d 0x08 0xd6 0x29
11+
12+
# ATT: vmovd %xmm2, %xmm1
13+
# INTEL: vmovd xmm1, xmm2
14+
0x62 0xf1 0x7e 0x08 0x7e 0xca
15+
16+
# ATT: vmovd %xmm2, %xmm1
17+
# INTEL: vmovd xmm1, xmm2
18+
0x62 0xf1 0x7d 0x08 0xd6 0xca
19+
20+
# ATT: vmovw %xmm5, (%ecx)
21+
# INTEL: vmovw dword ptr [ecx], xmm5
22+
0x62 0xf5 0x7e 0x08 0x7e 0x29
23+
24+
# ATT: vmovw (%ecx), %xmm5
25+
# INTEL: vmovw xmm5, word ptr [ecx]
26+
0x62 0xf5 0x7e 0x08 0x6e 0x29
27+
28+
# ATT: vmovw %xmm2, %xmm1
29+
# INTEL: vmovw xmm1, xmm2
30+
0x62 0xf5 0x7e 0x08 0x6e 0xca
31+
32+
# ATT: vmovw %xmm2, %xmm1
33+
# INTEL: vmovw xmm1, xmm2
34+
0x62 0xf5 0x7e 0x08 0x7e 0xca
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
2+
# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
3+
4+
# ATT: vmovd (%rcx), %xmm29
5+
# INTEL: vmovd xmm29, dword ptr [rcx]
6+
0x62 0x61 0x7e 0x08 0x7e 0x29
7+
8+
# ATT: vmovd %xmm29, (%rcx)
9+
# INTEL: vmovd dword ptr [rcx], xmm29
10+
0x62 0x61 0x7d 0x08 0xd6 0x29
11+
12+
# ATT: vmovd %xmm22, %xmm21
13+
# INTEL: vmovd xmm21, xmm22
14+
0x62 0xa1 0x7e 0x08 0x7e 0xee
15+
16+
# ATT: vmovd %xmm22, %xmm21
17+
# INTEL: vmovd xmm21, xmm22
18+
0x62 0xa1 0x7d 0x08 0xd6 0xee
19+
20+
# ATT: vmovw %xmm29, (%rcx)
21+
# INTEL: vmovw dword ptr [rcx], xmm29
22+
0x62 0x65 0x7e 0x08 0x7e 0x29
23+
24+
# ATT: vmovw (%rcx), %xmm29
25+
# INTEL: vmovw xmm29, word ptr [rcx]
26+
0x62 0x65 0x7e 0x08 0x6e 0x29
27+
28+
# ATT: vmovw %xmm22, %xmm21
29+
# INTEL: vmovw xmm21, xmm22
30+
0x62 0xa5 0x7e 0x08 0x6e 0xee
31+
32+
# ATT: vmovw %xmm22, %xmm21
33+
# INTEL: vmovw xmm21, xmm22
34+
0x62 0xa5 0x7e 0x08 0x7e 0xee
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s
2+
3+
// CHECK: vmovd 268435456(%esp,%esi,8), %xmm2
4+
// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10]
5+
vmovd 268435456(%esp,%esi,8), %xmm2
6+
7+
// CHECK: vmovd 291(%edi,%eax,4), %xmm2
8+
// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0x87,0x23,0x01,0x00,0x00]
9+
vmovd 291(%edi,%eax,4), %xmm2
10+
11+
// CHECK: vmovd (%eax), %xmm2
12+
// CHECK: encoding: [0xc5,0xf9,0x6e,0x10]
13+
vmovd (%eax), %xmm2
14+
15+
// CHECK: vmovd -128(,%ebp,2), %xmm2
16+
// CHECK: encoding: [0xc5,0xf9,0x6e,0x14,0x6d,0x80,0xff,0xff,0xff]
17+
vmovd -128(,%ebp,2), %xmm2
18+
19+
// CHECK: vmovd %xmm3, 268435456(%esp,%esi,8)
20+
// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10]
21+
vmovd %xmm3, 268435456(%esp,%esi,8)
22+
23+
// CHECK: vmovd %xmm3, 291(%edi,%eax,4)
24+
// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00]
25+
vmovd %xmm3, 291(%edi,%eax,4)
26+
27+
// CHECK: vmovd %xmm3, (%eax)
28+
// CHECK: encoding: [0xc5,0xf9,0x7e,0x18]
29+
vmovd %xmm3, (%eax)
30+
31+
// CHECK: vmovd %xmm3, -128(,%ebp,2)
32+
// CHECK: encoding: [0xc5,0xf9,0x7e,0x1c,0x6d,0x80,0xff,0xff,0xff]
33+
vmovd %xmm3, -128(,%ebp,2)
34+
35+
// CHECK: vmovw 268435456(%esp,%esi,8), %xmm2
36+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10]
37+
vmovw 268435456(%esp,%esi,8), %xmm2
38+
39+
// CHECK: vmovw 291(%edi,%eax,4), %xmm2
40+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0x87,0x23,0x01,0x00,0x00]
41+
vmovw 291(%edi,%eax,4), %xmm2
42+
43+
// CHECK: vmovw (%eax), %xmm2
44+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x10]
45+
vmovw (%eax), %xmm2
46+
47+
// CHECK: vmovw -64(,%ebp,2), %xmm2
48+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x14,0x6d,0xc0,0xff,0xff,0xff]
49+
vmovw -64(,%ebp,2), %xmm2
50+
51+
// CHECK: vmovw 254(%ecx), %xmm2
52+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x51,0x7f]
53+
vmovw 254(%ecx), %xmm2
54+
55+
// CHECK: vmovw -256(%edx), %xmm2
56+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x52,0x80]
57+
vmovw -256(%edx), %xmm2
58+
59+
// CHECK: vmovw %xmm3, 268435456(%esp,%esi,8)
60+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10]
61+
vmovw %xmm3, 268435456(%esp,%esi,8)
62+
63+
// CHECK: vmovw %xmm3, 291(%edi,%eax,4)
64+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00]
65+
vmovw %xmm3, 291(%edi,%eax,4)
66+
67+
// CHECK: vmovw %xmm3, (%eax)
68+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x18]
69+
vmovw %xmm3, (%eax)
70+
71+
// CHECK: vmovw %xmm3, -64(,%ebp,2)
72+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x1c,0x6d,0xc0,0xff,0xff,0xff]
73+
vmovw %xmm3, -64(,%ebp,2)
74+
75+
// CHECK: vmovw %xmm3, 254(%ecx)
76+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x59,0x7f]
77+
vmovw %xmm3, 254(%ecx)
78+
79+
// CHECK: vmovw %xmm3, -256(%edx)
80+
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x5a,0x80]
81+
vmovw %xmm3, -256(%edx)
82+

0 commit comments

Comments
 (0)