
[X86] Support SM4 EVEX version intrinsics/instructions. #113402

Merged (6 commits, Oct 28, 2024)
4 changes: 4 additions & 0 deletions clang/docs/ReleaseNotes.rst
@@ -628,6 +628,10 @@ X86 Support
* Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
``*_(mask(z)))_minmax_s[s|d|h]``.

- Supported intrinsics for ``SM4 and AVX10.2``.
* Supported SM4 intrinsics of ``_mm512_sm4key4_epi32`` and
``_mm512_sm4rnds4_epi32``.

- All intrinsics in adcintrin.h can now be used in constant expressions.

- All intrinsics in adxintrin.h can now be used in constant expressions.
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86.def
@@ -2179,6 +2179,10 @@ TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")

// SM4_EVEX
TARGET_BUILTIN(__builtin_ia32_vsm4key4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")

// AVX10 MINMAX
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
1 change: 1 addition & 0 deletions clang/lib/Headers/CMakeLists.txt
@@ -243,6 +243,7 @@ set(x86_files
shaintrin.h
sm3intrin.h
sm4intrin.h
sm4evexintrin.h
smmintrin.h
tbmintrin.h
tmmintrin.h
5 changes: 5 additions & 0 deletions clang/lib/Headers/immintrin.h
@@ -677,6 +677,11 @@ _storebe_i64(void * __P, long long __D) {
#include <avx10_2_512satcvtintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2_512__) && defined(__SM4__))
#include <sm4evexintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
32 changes: 32 additions & 0 deletions clang/lib/Headers/sm4evexintrin.h
@@ -0,0 +1,32 @@
/*===--------------- sm4evexintrin.h - SM4 EVEX intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <sm4evexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __SM4EVEXINTRIN_H
#define __SM4EVEXINTRIN_H

#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sm4,avx10.2-512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vsm4key4512((__v16su)__A, (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vsm4rnds4512((__v16su)__A, (__v16su)__B);
}

#undef __DEFAULT_FN_ATTRS512

#endif // __SM4EVEXINTRIN_H
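For orientation, a minimal usage sketch of the two new intrinsics follows. The wrapper name, variable names, and build flags are illustrative rather than part of this patch; the operand roles mirror the existing 128/256-bit SM4 intrinsics in sm4intrin.h (first source carries the previous keys or the data, second source the key constants or round keys).

// Illustrative only, not part of this patch.
// Build sketch (flag names assumed): clang -O2 -msm4 -mavx10.2-512 -c sm4_demo.c
#include <immintrin.h>

__m512i sm4_step(__m512i prev_keys, __m512i key_consts, __m512i state) {
  // Four rounds of SM4 key expansion, computed independently in each 128-bit lane.
  __m512i next_keys = _mm512_sm4key4_epi32(prev_keys, key_consts);
  // Four SM4 encryption rounds applied to each 128-bit lane of the state.
  return _mm512_sm4rnds4_epi32(state, next_keys);
}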
19 changes: 19 additions & 0 deletions clang/test/CodeGen/X86/sm4-evex-builtins.c
@@ -0,0 +1,19 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-- -target-feature +sm4 \
// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 %s -ffreestanding -triple=i386-- -target-feature +sm4 \
// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror | FileCheck %s

#include <immintrin.h>
#include <stddef.h>

__m512i test_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
  // CHECK-LABEL: @test_mm512_sm4key4_epi32(
  // CHECK: call <16 x i32> @llvm.x86.vsm4key4512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
  return _mm512_sm4key4_epi32(__A, __B);
}

__m512i test_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
  // CHECK-LABEL: @test_mm512_sm4rnds4_epi32(
  // CHECK: call <16 x i32> @llvm.x86.vsm4rnds4512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
  return _mm512_sm4rnds4_epi32(__A, __B);
}
2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.md
@@ -219,6 +219,8 @@ Changes to the X86 Backend

* Supported instructions of `MOVRS AND AVX10.2`

* Supported ISA of `SM4(EVEX)`.

Changes to the OCaml bindings
-----------------------------

10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsX86.td
@@ -6099,6 +6099,11 @@ let TargetPrefix = "x86" in {
        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
                              [llvm_v8i32_ty, llvm_v8i32_ty],
                              [IntrNoMem]>;
  def int_x86_vsm4key4512
      : ClangBuiltin<"__builtin_ia32_vsm4key4512">,
        DefaultAttrsIntrinsic<[llvm_v16i32_ty],
                              [llvm_v16i32_ty, llvm_v16i32_ty],
                              [IntrNoMem]>;
  def int_x86_vsm4rnds4128
      : ClangBuiltin<"__builtin_ia32_vsm4rnds4128">,
        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
@@ -6109,6 +6114,11 @@ let TargetPrefix = "x86" in {
        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
                              [llvm_v8i32_ty, llvm_v8i32_ty],
                              [IntrNoMem]>;
  def int_x86_vsm4rnds4512
      : ClangBuiltin<"__builtin_ia32_vsm4rnds4512">,
        DefaultAttrsIntrinsic<[llvm_v16i32_ty],
                              [llvm_v16i32_ty, llvm_v16i32_ty],
                              [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// RAO-INT intrinsics
14 changes: 14 additions & 0 deletions llvm/lib/Target/X86/X86InstrAVX10.td
@@ -1675,3 +1675,17 @@ defm VMOVRSD : vmovrs_p_vl<0x6f, "vmovrsd", avx512vl_i32_info>,
T_MAP5, XS, EVEX_CD8<32, CD8VF>, Sched<[WriteVecLoad]>;
defm VMOVRSQ : vmovrs_p_vl<0x6f, "vmovrsq", avx512vl_i64_info>,
T_MAP5, XS, REX_W, EVEX_CD8<64, CD8VF>, Sched<[WriteVecLoad]>;

// SM4(EVEX)
multiclass avx10_sm4_base<string OpStr> {
  // SM4_Base is in X86InstrSSE.td.
  let Predicates = [HasSM4, HasAVX10_2], AddedComplexity = 1 in {
    defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128;
    defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256;
  }
  let Predicates = [HasSM4, HasAVX10_2_512] in
    defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512;
}

defm VSM4KEY4 : avx10_sm4_base<"vsm4key4">, T8, XS, EVEX, VVVV;
defm VSM4RNDS4 : avx10_sm4_base<"vsm4rnds4">, T8, XD, EVEX, VVVV;
64 changes: 64 additions & 0 deletions llvm/test/CodeGen/X86/sm4-evex-intrinsics.ll
@@ -0,0 +1,64 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-- --show-mc-encoding -mattr=+sm4,+avx10.2-512 | FileCheck %s
; RUN: llc < %s -verify-machineinstrs -mtriple=i686-- --show-mc-encoding -mattr=+sm4,+avx10.2-512 | FileCheck %s

define <4 x i32> @test_int_x86_vsm4key4128(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_int_x86_vsm4key4128:
; CHECK: # %bb.0:
; CHECK-NEXT: vsm4key4 %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0xda,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %A, <4 x i32> %B)
ret <4 x i32> %ret
}
declare <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %A, <4 x i32> %B)

define <8 x i32> @test_int_x86_vsm4key4256(<8 x i32> %A, <8 x i32> %B) {
; CHECK-LABEL: test_int_x86_vsm4key4256:
; CHECK: # %bb.0:
; CHECK-NEXT: vsm4key4 %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7e,0xda,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %A, <8 x i32> %B)
ret <8 x i32> %ret
}
declare <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %A, <8 x i32> %B)

define <16 x i32> @test_int_x86_vsm4key4512(<16 x i32> %A, <16 x i32> %B) {
; CHECK-LABEL: test_int_x86_vsm4key4512:
; CHECK: # %bb.0:
; CHECK-NEXT: vsm4key4 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7e,0x48,0xda,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <16 x i32> @llvm.x86.vsm4key4512(<16 x i32> %A, <16 x i32> %B)
ret <16 x i32> %ret
}
declare <16 x i32> @llvm.x86.vsm4key4512(<16 x i32> %A, <16 x i32> %B)

define <4 x i32> @test_int_x86_vsm4rnds4128(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_int_x86_vsm4rnds4128:
; CHECK: # %bb.0:
; CHECK-NEXT: vsm4rnds4 %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0xda,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %A, <4 x i32> %B)
ret <4 x i32> %ret
}
declare <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %A, <4 x i32> %B)

define <8 x i32> @test_int_x86_vsm4rnds4256(<8 x i32> %A, <8 x i32> %B) {
; CHECK-LABEL: test_int_x86_vsm4rnds4256:
; CHECK: # %bb.0:
; CHECK-NEXT: vsm4rnds4 %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7f,0xda,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %A, <8 x i32> %B)
ret <8 x i32> %ret
}
declare <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %A, <8 x i32> %B)

define <16 x i32> @test_int_x86_vsm4rnds4512(<16 x i32> %A, <16 x i32> %B) {
; CHECK-LABEL: test_int_x86_vsm4rnds4512:
; CHECK: # %bb.0:
; CHECK-NEXT: vsm4rnds4 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0xda,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <16 x i32> @llvm.x86.vsm4rnds4512(<16 x i32> %A, <16 x i32> %B)
ret <16 x i32> %ret
}
declare <16 x i32> @llvm.x86.vsm4rnds4512(<16 x i32> %A, <16 x i32> %B)
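As noted at the top of this test, the CHECK lines are autogenerated. A typical regeneration command after touching the patterns (the build directory path is an assumption) looks like:

python3 llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
    llvm/test/CodeGen/X86/sm4-evex-intrinsics.ll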

170 changes: 170 additions & 0 deletions llvm/test/MC/Disassembler/X86/sm4-evex-32.txt
@@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown | FileCheck %s --check-prefixes=ATT
# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL

# ATT: vsm4key4 %zmm4, %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmm4
0x62,0xf2,0x66,0x48,0xda,0xd4

# ATT: vsm4key4 268435456(%esp,%esi,8), %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
0x62,0xf2,0x66,0x48,0xda,0x94,0xf4,0x00,0x00,0x00,0x10

# ATT: vsm4key4 291(%edi,%eax,4), %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmmword ptr [edi + 4*eax + 291]
0x62,0xf2,0x66,0x48,0xda,0x94,0x87,0x23,0x01,0x00,0x00

# ATT: vsm4key4 (%eax), %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmmword ptr [eax]
0x62,0xf2,0x66,0x48,0xda,0x10

# ATT: vsm4key4 -2048(,%ebp,2), %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmmword ptr [2*ebp - 2048]
0x62,0xf2,0x66,0x48,0xda,0x14,0x6d,0x00,0xf8,0xff,0xff

# ATT: vsm4key4 8128(%ecx), %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmmword ptr [ecx + 8128]
0x62,0xf2,0x66,0x48,0xda,0x51,0x7f

# ATT: vsm4key4 -8192(%edx), %zmm3, %zmm2
# INTEL: vsm4key4 zmm2, zmm3, zmmword ptr [edx - 8192]
0x62,0xf2,0x66,0x48,0xda,0x52,0x80

# ATT: vsm4rnds4 %zmm4, %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmm4
0x62,0xf2,0x67,0x48,0xda,0xd4

# ATT: vsm4rnds4 268435456(%esp,%esi,8), %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
0x62,0xf2,0x67,0x48,0xda,0x94,0xf4,0x00,0x00,0x00,0x10

# ATT: vsm4rnds4 291(%edi,%eax,4), %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmmword ptr [edi + 4*eax + 291]
0x62,0xf2,0x67,0x48,0xda,0x94,0x87,0x23,0x01,0x00,0x00

# ATT: vsm4rnds4 (%eax), %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmmword ptr [eax]
0x62,0xf2,0x67,0x48,0xda,0x10

# ATT: vsm4rnds4 -2048(,%ebp,2), %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmmword ptr [2*ebp - 2048]
0x62,0xf2,0x67,0x48,0xda,0x14,0x6d,0x00,0xf8,0xff,0xff

# ATT: vsm4rnds4 8128(%ecx), %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmmword ptr [ecx + 8128]
0x62,0xf2,0x67,0x48,0xda,0x51,0x7f

# ATT: vsm4rnds4 -8192(%edx), %zmm3, %zmm2
# INTEL: vsm4rnds4 zmm2, zmm3, zmmword ptr [edx - 8192]
0x62,0xf2,0x67,0x48,0xda,0x52,0x80

# ATT: vsm4key4 %ymm4, %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymm4
0x62,0xf2,0x66,0x28,0xda,0xd4

# ATT: vsm4key4 %xmm4, %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmm4
0x62,0xf2,0x66,0x08,0xda,0xd4

# ATT: vsm4key4 268435456(%esp,%esi,8), %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
0x62,0xf2,0x66,0x28,0xda,0x94,0xf4,0x00,0x00,0x00,0x10

# ATT: vsm4key4 291(%edi,%eax,4), %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
0x62,0xf2,0x66,0x28,0xda,0x94,0x87,0x23,0x01,0x00,0x00

# ATT: vsm4key4 (%eax), %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymmword ptr [eax]
0x62,0xf2,0x66,0x28,0xda,0x10

# ATT: vsm4key4 -1024(,%ebp,2), %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
0x62,0xf2,0x66,0x28,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff

# ATT: vsm4key4 4064(%ecx), %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymmword ptr [ecx + 4064]
0x62,0xf2,0x66,0x28,0xda,0x51,0x7f

# ATT: vsm4key4 -4096(%edx), %ymm3, %ymm2
# INTEL: vsm4key4 ymm2, ymm3, ymmword ptr [edx - 4096]
0x62,0xf2,0x66,0x28,0xda,0x52,0x80

# ATT: vsm4key4 268435456(%esp,%esi,8), %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
0x62,0xf2,0x66,0x08,0xda,0x94,0xf4,0x00,0x00,0x00,0x10

# ATT: vsm4key4 291(%edi,%eax,4), %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
0x62,0xf2,0x66,0x08,0xda,0x94,0x87,0x23,0x01,0x00,0x00

# ATT: vsm4key4 (%eax), %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmmword ptr [eax]
0x62,0xf2,0x66,0x08,0xda,0x10

# ATT: vsm4key4 -512(,%ebp,2), %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
0x62,0xf2,0x66,0x08,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff

# ATT: vsm4key4 2032(%ecx), %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmmword ptr [ecx + 2032]
0x62,0xf2,0x66,0x08,0xda,0x51,0x7f

# ATT: vsm4key4 -2048(%edx), %xmm3, %xmm2
# INTEL: vsm4key4 xmm2, xmm3, xmmword ptr [edx - 2048]
0x62,0xf2,0x66,0x08,0xda,0x52,0x80

# ATT: vsm4rnds4 %ymm4, %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymm4
0x62,0xf2,0x67,0x28,0xda,0xd4

# ATT: vsm4rnds4 %xmm4, %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmm4
0x62,0xf2,0x67,0x08,0xda,0xd4

# ATT: vsm4rnds4 268435456(%esp,%esi,8), %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
0x62,0xf2,0x67,0x28,0xda,0x94,0xf4,0x00,0x00,0x00,0x10

# ATT: vsm4rnds4 291(%edi,%eax,4), %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
0x62,0xf2,0x67,0x28,0xda,0x94,0x87,0x23,0x01,0x00,0x00

# ATT: vsm4rnds4 (%eax), %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymmword ptr [eax]
0x62,0xf2,0x67,0x28,0xda,0x10

# ATT: vsm4rnds4 -1024(,%ebp,2), %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
0x62,0xf2,0x67,0x28,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff

# ATT: vsm4rnds4 4064(%ecx), %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymmword ptr [ecx + 4064]
0x62,0xf2,0x67,0x28,0xda,0x51,0x7f

# ATT: vsm4rnds4 -4096(%edx), %ymm3, %ymm2
# INTEL: vsm4rnds4 ymm2, ymm3, ymmword ptr [edx - 4096]
0x62,0xf2,0x67,0x28,0xda,0x52,0x80

# ATT: vsm4rnds4 268435456(%esp,%esi,8), %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
0x62,0xf2,0x67,0x08,0xda,0x94,0xf4,0x00,0x00,0x00,0x10

# ATT: vsm4rnds4 291(%edi,%eax,4), %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
0x62,0xf2,0x67,0x08,0xda,0x94,0x87,0x23,0x01,0x00,0x00

# ATT: vsm4rnds4 (%eax), %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmmword ptr [eax]
0x62,0xf2,0x67,0x08,0xda,0x10

# ATT: vsm4rnds4 -512(,%ebp,2), %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
0x62,0xf2,0x67,0x08,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff

# ATT: vsm4rnds4 2032(%ecx), %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmmword ptr [ecx + 2032]
0x62,0xf2,0x67,0x08,0xda,0x51,0x7f

# ATT: vsm4rnds4 -2048(%edx), %xmm3, %xmm2
# INTEL: vsm4rnds4 xmm2, xmm3, xmmword ptr [edx - 2048]
0x62,0xf2,0x67,0x08,0xda,0x52,0x80