Skip to content

[X86] Correct the cdisp8 encoding for VGF2P8AFFINEINVQB and VGF2P8AFFINEQB. #120340

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 18, 2024

Conversation

topperc
Copy link
Collaborator

@topperc topperc commented Dec 18, 2024

These instructions use a 64-bit broadcast size so the element size for CD8 should be 64.

Pointed out by larkmjc on discord.

…INEQB.

These instructions use a 64-bit broadcast size so the element size
for CD8 should be 64.
@llvmbot llvmbot added backend:X86 mc Machine (object) code labels Dec 18, 2024
@llvmbot
Copy link
Member

llvmbot commented Dec 18, 2024

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-x86

Author: Craig Topper (topperc)

Changes

These instructions use a 64-bit broadcast size so the element size for CD8 should be 64.

Pointed out by larkmjc on discord.


Full diff: https://github.com/llvm/llvm-project/pull/120340.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86InstrAVX512.td (+2-2)
  • (modified) llvm/test/MC/X86/avx512gfni-att.s (+7)
  • (modified) llvm/test/MC/X86/avx512vl_gfni-att.s (+15)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 83a2e981ffd7a8..e899807cd1b7c5 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12569,10 +12569,10 @@ multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
 
 defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
                          X86GF2P8affineinvqb, SchedWriteVecIMul>,
-                         EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
+                         EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W, AVX512AIi8Base;
 defm VGF2P8AFFINEQB    : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
                          X86GF2P8affineqb, SchedWriteVecIMul>,
-                         EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
+                         EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W, AVX512AIi8Base;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/X86/avx512gfni-att.s b/llvm/test/MC/X86/avx512gfni-att.s
index 975595deef58ea..1100b49aa444d4 100644
--- a/llvm/test/MC/X86/avx512gfni-att.s
+++ b/llvm/test/MC/X86/avx512gfni-att.s
@@ -176,3 +176,10 @@
 // CHECK: encoding: [0x62,0xf3,0xdd,0x50,0xce,0x09,0x07]
           vgf2p8affineqb  $7, (%rcx){1to8}, %zmm20, %zmm1
 
+// CHECK: vgf2p8affineinvqb $7, 8(%rcx){1to8}, %zmm20, %zmm1
+// CHECK: encoding: [0x62,0xf3,0xdd,0x50,0xcf,0x49,0x01,0x07]
+          vgf2p8affineinvqb $7, 8(%rcx){1to8}, %zmm20, %zmm1
+
+// CHECK: vgf2p8affineqb  $7, 8(%rcx){1to8}, %zmm20, %zmm1
+// CHECK: encoding: [0x62,0xf3,0xdd,0x50,0xce,0x49,0x01,0x07]
+          vgf2p8affineqb  $7, 8(%rcx){1to8}, %zmm20, %zmm1
diff --git a/llvm/test/MC/X86/avx512vl_gfni-att.s b/llvm/test/MC/X86/avx512vl_gfni-att.s
index a44211332de601..33d7e45a7b32d2 100644
--- a/llvm/test/MC/X86/avx512vl_gfni-att.s
+++ b/llvm/test/MC/X86/avx512vl_gfni-att.s
@@ -352,3 +352,18 @@
 // CHECK: encoding: [0x62,0xf3,0xdd,0x30,0xce,0x09,0x07]
           vgf2p8affineqb  $7, (%rcx){1to4}, %ymm20, %ymm1
 
+// CHECK: vgf2p8affineinvqb $7, 8(%rcx){1to2}, %xmm20, %xmm1
+// CHECK: encoding: [0x62,0xf3,0xdd,0x10,0xcf,0x49,0x01,0x07]
+          vgf2p8affineinvqb $7, 8(%rcx){1to2}, %xmm20, %xmm1
+
+// CHECK: vgf2p8affineinvqb $7, 8(%rcx){1to4}, %ymm20, %ymm1
+// CHECK: encoding: [0x62,0xf3,0xdd,0x30,0xcf,0x49,0x01,0x07]
+          vgf2p8affineinvqb $7, 8(%rcx){1to4}, %ymm20, %ymm1
+
+// CHECK: vgf2p8affineqb  $7, 8(%rcx){1to2}, %xmm20, %xmm1
+// CHECK: encoding: [0x62,0xf3,0xdd,0x10,0xce,0x49,0x01,0x07]
+          vgf2p8affineqb  $7, 8(%rcx){1to2}, %xmm20, %xmm1
+
+// CHECK: vgf2p8affineqb  $7, 8(%rcx){1to4}, %ymm20, %ymm1
+// CHECK: encoding: [0x62,0xf3,0xdd,0x30,0xce,0x49,0x01,0x07]
+          vgf2p8affineqb  $7, 8(%rcx){1to4}, %ymm20, %ymm1

@michaeljclark
Copy link
Contributor

larkmjc here. I was performing some differential fuzzing and checking word size with the broadcast cdisp8 multiplier. just to add to the notes. here are my exemplars from before the patch along with binutils output. these differ slightly from the test cases as they are using SIB and the ib is 0 instead of 7. I have checked this patch on my side but haven't run the tests in the test suite yet. thanks.

llvm (18:18.1.3-1ubuntu1)

62 f3 fd 19 cf 44 51 01 00     vgf2p8affineinvqb	xmm0 {k1}, xmm0, qword ptr [rcx + 2*rdx + 1]{1to2}, 0
62 f3 fd 39 cf 44 51 01 00     vgf2p8affineinvqb	ymm0 {k1}, ymm0, qword ptr [rcx + 2*rdx + 1]{1to4}, 0
62 f3 fd 59 cf 44 51 01 00     vgf2p8affineinvqb	zmm0 {k1}, zmm0, qword ptr [rcx + 2*rdx + 1]{1to8}, 0
62 f3 fd 19 ce 44 51 01 00     vgf2p8affineqb	xmm0 {k1}, xmm0, qword ptr [rcx + 2*rdx + 1]{1to2}, 0
62 f3 fd 39 ce 44 51 01 00     vgf2p8affineqb	ymm0 {k1}, ymm0, qword ptr [rcx + 2*rdx + 1]{1to4}, 0
62 f3 fd 59 ce 44 51 01 00     vgf2p8affineqb	zmm0 {k1}, zmm0, qword ptr [rcx + 2*rdx + 1]{1to8}, 0

binutils (2.42-4ubuntu2.3)

62 f3 fd 19 cf 44 51 01 00     vgf2p8affineinvqb xmm0{k1},xmm0,QWORD BCST [rcx+rdx*2+0x8],0x0
62 f3 fd 39 cf 44 51 01 00     vgf2p8affineinvqb ymm0{k1},ymm0,QWORD BCST [rcx+rdx*2+0x8],0x0
62 f3 fd 59 cf 44 51 01 00     vgf2p8affineinvqb zmm0{k1},zmm0,QWORD BCST [rcx+rdx*2+0x8],0x0
62 f3 fd 19 ce 44 51 01 00     vgf2p8affineqb xmm0{k1},xmm0,QWORD BCST [rcx+rdx*2+0x8],0x0
62 f3 fd 39 ce 44 51 01 00     vgf2p8affineqb ymm0{k1},ymm0,QWORD BCST [rcx+rdx*2+0x8],0x0
62 f3 fd 59 ce 44 51 01 00     vgf2p8affineqb zmm0{k1},zmm0,QWORD BCST [rcx+rdx*2+0x8],0x0

Copy link
Contributor

@KanRobert KanRobert left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@topperc topperc merged commit efc3671 into llvm:main Dec 18, 2024
11 checks passed
@topperc topperc deleted the pr/cd8-affine branch December 18, 2024 05:36
@llvm-ci
Copy link
Collaborator

llvm-ci commented Dec 18, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/12496

Here is the relevant piece of the build log for the reference
Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libomp :: tasking/issue-94260-2.c' FAILED ********************
Exit Code: -11

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang -fopenmp   -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -fno-omit-frame-pointer -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/ompt /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic && /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/ompt /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11

--

********************


KanRobert pushed a commit that referenced this pull request Jan 11, 2025
)

during differential fuzzing, I found 8 more instructions with disp8
offset multiplier differences to binutils. somewhat sure there is a bug
in the X86 LLVM disp8 offset multipliers for this subset of vector
scatter and gather prefetch instructions. please check and refer to the
previous pull request: #120340

these vector scatter and gather prefetch instructions also have an
unusual k mask operand position but I have not addressed this with this
patch as I am unsure how to change the Intel format in the tablegen
file.

```
hex:	62 f2 fd 49 c6 4c 51 01
llvm:	vgatherpf0dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vgatherpf0dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vgatherpf0dpd 	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 4c 51 01
llvm:	vgatherpf0qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vgatherpf0qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vgatherpf0qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 54 51 01
llvm:	vgatherpf1dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vgatherpf1dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vgatherpf1dpd	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 54 51 01
llvm:	vgatherpf1qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vgatherpf1qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vgatherpf1qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 6c 51 01
llvm:	vscatterpf0dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vscatterpf0dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vscatterpf0dpd	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 6c 51 01
llvm:	vscatterpf0qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vscatterpf0qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vscatterpf0qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 74 51 01
llvm:	vscatterpf1dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vscatterpf1dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vscatterpf1dpd QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 74 51 01
llvm:	vscatterpf1qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vscatterpf1qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vscatterpf1qps DWORD PTR [rcx+zmm2*2+0x4]{k1}
```
github-actions bot pushed a commit to arm/arm-toolchain that referenced this pull request Jan 11, 2025
…fetch (#122051)

during differential fuzzing, I found 8 more instructions with disp8
offset multiplier differences to binutils. somewhat sure there is a bug
in the X86 LLVM disp8 offset multipliers for this subset of vector
scatter and gather prefetch instructions. please check and refer to the
previous pull request: llvm/llvm-project#120340

these vector scatter and gather prefetch instructions also have an
unusual k mask operand position but I have not addressed this with this
patch as I am unsure how to change the Intel format in the tablegen
file.

```
hex:	62 f2 fd 49 c6 4c 51 01
llvm:	vgatherpf0dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vgatherpf0dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vgatherpf0dpd 	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 4c 51 01
llvm:	vgatherpf0qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vgatherpf0qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vgatherpf0qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 54 51 01
llvm:	vgatherpf1dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vgatherpf1dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vgatherpf1dpd	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 54 51 01
llvm:	vgatherpf1qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vgatherpf1qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vgatherpf1qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 6c 51 01
llvm:	vscatterpf0dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vscatterpf0dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vscatterpf0dpd	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 6c 51 01
llvm:	vscatterpf0qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vscatterpf0qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vscatterpf0qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 74 51 01
llvm:	vscatterpf1dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vscatterpf1dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vscatterpf1dpd QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 74 51 01
llvm:	vscatterpf1qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vscatterpf1qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vscatterpf1qps DWORD PTR [rcx+zmm2*2+0x4]{k1}
```
BaiXilin pushed a commit to BaiXilin/llvm-fix-vnni-instr-types that referenced this pull request Jan 12, 2025
…#122051)

during differential fuzzing, I found 8 more instructions with disp8
offset multiplier differences to binutils. somewhat sure there is a bug
in the X86 LLVM disp8 offset multipliers for this subset of vector
scatter and gather prefetch instructions. please check and refer to the
previous pull request: llvm#120340

these vector scatter and gather prefetch instructions also have an
unusual k mask operand position but I have not addressed this with this
patch as I am unsure how to change the Intel format in the tablegen
file.

```
hex:	62 f2 fd 49 c6 4c 51 01
llvm:	vgatherpf0dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vgatherpf0dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vgatherpf0dpd 	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 4c 51 01
llvm:	vgatherpf0qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vgatherpf0qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vgatherpf0qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 54 51 01
llvm:	vgatherpf1dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vgatherpf1dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vgatherpf1dpd	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 54 51 01
llvm:	vgatherpf1qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vgatherpf1qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vgatherpf1qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 6c 51 01
llvm:	vscatterpf0dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vscatterpf0dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vscatterpf0dpd	QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 6c 51 01
llvm:	vscatterpf0qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vscatterpf0qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vscatterpf0qps	DWORD PTR [rcx+zmm2*2+0x4]{k1}

hex:	62 f2 fd 49 c6 74 51 01
llvm:	vscatterpf1dpd	{k1}, zmmword ptr [rcx + 2*ymm2 + 4]
ours:	vscatterpf1dpd	qword ptr * 8 [rcx + 2*ymm2 + 8] {k1}
gnu:	vscatterpf1dpd QWORD PTR [rcx+ymm2*2+0x8]{k1}

hex:	62 f2 7d 49 c7 74 51 01
llvm:	vscatterpf1qps	{k1}, ymmword ptr [rcx + 2*zmm2 + 8]
ours:	vscatterpf1qps	dword ptr * 8 [rcx + 2*zmm2 + 4] {k1}
gnu:	vscatterpf1qps DWORD PTR [rcx+zmm2*2+0x4]{k1}
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:X86 mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants