Skip to content

[X86][AVX10.2] Support AVX10.2 MOVZXC new Instructions. #108537

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/lib/Headers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ set(x86_files
avx10_2_512satcvtintrin.h
avx10_2bf16intrin.h
avx10_2convertintrin.h
avx10_2copyintrin.h
avx10_2minmaxintrin.h
avx10_2niintrin.h
avx10_2satcvtdsintrin.h
Expand Down
34 changes: 34 additions & 0 deletions clang/lib/Headers/avx10_2copyintrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx10_2copyintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2COPYINTRIN_H
#define __AVX10_2COPYINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
__min_vector_width__(128)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) {
return (__m128i)__builtin_shufflevector(
(__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) {
return (__m128i)__builtin_shufflevector(
(__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8);
}

#undef __DEFAULT_FN_ATTRS128

#endif // __AVX10_2COPYINTRIN_H
1 change: 1 addition & 0 deletions clang/lib/Headers/immintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ _storebe_i64(void * __P, long long __D) {
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
#include <avx10_2bf16intrin.h>
#include <avx10_2convertintrin.h>
#include <avx10_2copyintrin.h>
#include <avx10_2minmaxintrin.h>
#include <avx10_2niintrin.h>
#include <avx10_2satcvtdsintrin.h>
Expand Down
17 changes: 17 additions & 0 deletions clang/test/CodeGen/X86/avx512copy-builtins.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx10.2-512 \
// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s

#include <immintrin.h>
#include <stddef.h>

__m128i test_mm_move_epi32(__m128i A) {
// CHECK-LABEL: test_mm_move_epi32
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
return _mm_move_epi32(A);
}

__m128i test_mm_move_epi16(__m128i A) {
// CHECK-LABEL: test_mm_move_epi16
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
return _mm_move_epi16(A);
}
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12348,7 +12348,7 @@ static SDValue lowerShuffleAsElementInsertion(
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
(EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
Expand Down
64 changes: 64 additions & 0 deletions llvm/lib/Target/X86/X86InstrAVX10.td
Original file line number Diff line number Diff line change
Expand Up @@ -1583,3 +1583,67 @@ let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {
"vucomxss", SSEPackedSingle>,
TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
}

//-------------------------------------------------
// AVX10 MOVZXC (COPY) instructions
//-------------------------------------------------
let Predicates = [HasAVX10_2] in {
def VMOVZPDILo2PDIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v4i32 (X86vzmovl
(v4i32 VR128X:$src))))]>, EVEX,
Sched<[WriteVecMoveFromGpr]>;

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOVZPDILo2PDIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}", []>, EVEX,
EVEX_CD8<32, CD8VT1>,
Sched<[WriteVecLoad]>;

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVZPDILo2PDIZmr : AVX512PDI<0xD6, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}", []>, EVEX,
EVEX_CD8<32, CD8VT1>,
Sched<[WriteVecStore]>;

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def VMOVZPDILo2PDIZrr2 : AVX512PDI<0xD6, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}", []>, EVEX,
Sched<[WriteVecMoveFromGpr]>;
def : InstAlias<"vmovd.s\t{$src, $dst|$dst, $src}",
(VMOVZPDILo2PDIZrr2 VR128X:$dst, VR128X:$src), 0>;

def VMOVZPWILo2PWIZrr : AVX512XSI<0x6E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovw\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v8i16 (X86vzmovl
(v8i16 VR128X:$src))))]>, EVEX, T_MAP5,
Sched<[WriteVecMoveFromGpr]>;

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOVZPWILo2PWIZrm : AVX512XSI<0x6E, MRMSrcMem, (outs VR128X:$dst),
(ins i16mem:$src),
"vmovw\t{$src, $dst|$dst, $src}", []>, EVEX,
EVEX_CD8<16, CD8VT1>, T_MAP5,
Sched<[WriteVecLoad]>;

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVZPWILo2PWIZmr : AVX512XSI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovw\t{$src, $dst|$dst, $src}", []>, EVEX,
EVEX_CD8<16, CD8VT1>, T_MAP5,
Sched<[WriteVecStore]>;

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def VMOVZPWILo2PWIZrr2 : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovw\t{$src, $dst|$dst, $src}",
[]>, EVEX, T_MAP5,
Sched<[WriteVecMoveFromGpr]>;
def : InstAlias<"vmovw.s\t{$src, $dst|$dst, $src}",
(VMOVZPWILo2PWIZrr2 VR128X:$dst, VR128X:$src), 0>;
}
35 changes: 35 additions & 0 deletions llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX102
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx512f | FileCheck %s --check-prefixes=NOAVX512MOVZXC

define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
; AVX102-LABEL: test_mm_move_epi32:
; AVX102: # %bb.0:
; AVX102-NEXT: vmovd %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x7e,0xc0]
; AVX102-NEXT: retq # encoding: [0xc3]
;
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; NOAVX512MOVZXC-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
ret <4 x i32> %res
}

define <8 x i16> @test_mm_move_epi16(<8 x i16> %a0) nounwind {
; AVX102-LABEL: test_mm_move_epi16:
; AVX102: # %bb.0:
; AVX102-NEXT: vmovw %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x6e,0xc0]
; AVX102-NEXT: retq # encoding: [0xc3]
;
; NOAVX512MOVZXC-LABEL: test_mm_move_epi16:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xef,0xc9]
; NOAVX512MOVZXC-NEXT: vpblendw $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0e,0xc0,0x01]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <8 x i16> %res
}
34 changes: 34 additions & 0 deletions llvm/test/MC/Disassembler/X86/avx10.2-copy-32.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT
# RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL

# ATT: vmovd (%ecx), %xmm5
# INTEL: vmovd xmm5, dword ptr [ecx]
0x62 0xf1 0x7e 0x08 0x7e 0x29

# ATT: vmovd %xmm5, (%ecx)
# INTEL: vmovd dword ptr [ecx], xmm5
0x62 0xf1 0x7d 0x08 0xd6 0x29

# ATT: vmovd %xmm2, %xmm1
# INTEL: vmovd xmm1, xmm2
0x62 0xf1 0x7e 0x08 0x7e 0xca

# ATT: vmovd %xmm2, %xmm1
# INTEL: vmovd xmm1, xmm2
0x62 0xf1 0x7d 0x08 0xd6 0xca

# ATT: vmovw %xmm5, (%ecx)
# INTEL: vmovw dword ptr [ecx], xmm5
0x62 0xf5 0x7e 0x08 0x7e 0x29

# ATT: vmovw (%ecx), %xmm5
# INTEL: vmovw xmm5, word ptr [ecx]
0x62 0xf5 0x7e 0x08 0x6e 0x29

# ATT: vmovw %xmm2, %xmm1
# INTEL: vmovw xmm1, xmm2
0x62 0xf5 0x7e 0x08 0x6e 0xca

# ATT: vmovw %xmm2, %xmm1
# INTEL: vmovw xmm1, xmm2
0x62 0xf5 0x7e 0x08 0x7e 0xca
34 changes: 34 additions & 0 deletions llvm/test/MC/Disassembler/X86/avx10.2-copy-64.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL

# ATT: vmovd (%rcx), %xmm29
# INTEL: vmovd xmm29, dword ptr [rcx]
0x62 0x61 0x7e 0x08 0x7e 0x29

# ATT: vmovd %xmm29, (%rcx)
# INTEL: vmovd dword ptr [rcx], xmm29
0x62 0x61 0x7d 0x08 0xd6 0x29

# ATT: vmovd %xmm22, %xmm21
# INTEL: vmovd xmm21, xmm22
0x62 0xa1 0x7e 0x08 0x7e 0xee

# ATT: vmovd %xmm22, %xmm21
# INTEL: vmovd xmm21, xmm22
0x62 0xa1 0x7d 0x08 0xd6 0xee

# ATT: vmovw %xmm29, (%rcx)
# INTEL: vmovw dword ptr [rcx], xmm29
0x62 0x65 0x7e 0x08 0x7e 0x29

# ATT: vmovw (%rcx), %xmm29
# INTEL: vmovw xmm29, word ptr [rcx]
0x62 0x65 0x7e 0x08 0x6e 0x29

# ATT: vmovw %xmm22, %xmm21
# INTEL: vmovw xmm21, xmm22
0x62 0xa5 0x7e 0x08 0x6e 0xee

# ATT: vmovw %xmm22, %xmm21
# INTEL: vmovw xmm21, xmm22
0x62 0xa5 0x7e 0x08 0x7e 0xee
82 changes: 82 additions & 0 deletions llvm/test/MC/X86/avx10.2-copy-32-att.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s

// CHECK: vmovd 268435456(%esp,%esi,8), %xmm2
// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10]
vmovd 268435456(%esp,%esi,8), %xmm2

// CHECK: vmovd 291(%edi,%eax,4), %xmm2
// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0x87,0x23,0x01,0x00,0x00]
vmovd 291(%edi,%eax,4), %xmm2

// CHECK: vmovd (%eax), %xmm2
// CHECK: encoding: [0xc5,0xf9,0x6e,0x10]
vmovd (%eax), %xmm2

// CHECK: vmovd -128(,%ebp,2), %xmm2
// CHECK: encoding: [0xc5,0xf9,0x6e,0x14,0x6d,0x80,0xff,0xff,0xff]
vmovd -128(,%ebp,2), %xmm2

// CHECK: vmovd %xmm3, 268435456(%esp,%esi,8)
// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10]
vmovd %xmm3, 268435456(%esp,%esi,8)

// CHECK: vmovd %xmm3, 291(%edi,%eax,4)
// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00]
vmovd %xmm3, 291(%edi,%eax,4)

// CHECK: vmovd %xmm3, (%eax)
// CHECK: encoding: [0xc5,0xf9,0x7e,0x18]
vmovd %xmm3, (%eax)

// CHECK: vmovd %xmm3, -128(,%ebp,2)
// CHECK: encoding: [0xc5,0xf9,0x7e,0x1c,0x6d,0x80,0xff,0xff,0xff]
vmovd %xmm3, -128(,%ebp,2)

// CHECK: vmovw 268435456(%esp,%esi,8), %xmm2
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10]
vmovw 268435456(%esp,%esi,8), %xmm2

// CHECK: vmovw 291(%edi,%eax,4), %xmm2
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0x87,0x23,0x01,0x00,0x00]
vmovw 291(%edi,%eax,4), %xmm2

// CHECK: vmovw (%eax), %xmm2
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x10]
vmovw (%eax), %xmm2

// CHECK: vmovw -64(,%ebp,2), %xmm2
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x14,0x6d,0xc0,0xff,0xff,0xff]
vmovw -64(,%ebp,2), %xmm2

// CHECK: vmovw 254(%ecx), %xmm2
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x51,0x7f]
vmovw 254(%ecx), %xmm2

// CHECK: vmovw -256(%edx), %xmm2
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x52,0x80]
vmovw -256(%edx), %xmm2

// CHECK: vmovw %xmm3, 268435456(%esp,%esi,8)
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10]
vmovw %xmm3, 268435456(%esp,%esi,8)

// CHECK: vmovw %xmm3, 291(%edi,%eax,4)
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00]
vmovw %xmm3, 291(%edi,%eax,4)

// CHECK: vmovw %xmm3, (%eax)
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x18]
vmovw %xmm3, (%eax)

// CHECK: vmovw %xmm3, -64(,%ebp,2)
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x1c,0x6d,0xc0,0xff,0xff,0xff]
vmovw %xmm3, -64(,%ebp,2)

// CHECK: vmovw %xmm3, 254(%ecx)
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x59,0x7f]
vmovw %xmm3, 254(%ecx)

// CHECK: vmovw %xmm3, -256(%edx)
// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x5a,0x80]
vmovw %xmm3, -256(%edx)

Loading
Loading