Skip to content

Commit d35d8da

Browse files
committed
[AMDGPU] Mark additional VOP3 as commutable
Note that only src0 and src1 are commuted when the isCommutable flag is set. This patch does not change that; it just makes it possible to commute src0 and src1 of more instructions.

Reviewed By: foad, rampitec
Differential Revision: https://reviews.llvm.org/D99376
Change-Id: I61e20490962d95ea429beb355c55f55c024dafdc
1 parent a43f588 commit d35d8da

File tree

8 files changed

+143
-67
lines changed

8 files changed

+143
-67
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 43 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -364,22 +364,25 @@ defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGP
364364
defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
365365
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
366366

367-
let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
368-
defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
369-
defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
370-
defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
371-
defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
372-
defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
373-
defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
374-
defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
375-
defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
376-
defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
377-
} // End mayRaiseFPException = 0
367+
// XXX - mayRaiseFPException = 0 seems suspect, but the manual doesn't say these instructions raise FP exceptions
368+
let mayRaiseFPException = 0, isCommutable = 1 in {
369+
defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
370+
defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
371+
defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
372+
defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
373+
defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
374+
defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
375+
defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
376+
defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
377+
defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
378+
} // End mayRaiseFPException = 0, isCommutable = 1
378379

379-
defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
380-
defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
381-
defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
382-
defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
380+
let isCommutable = 1 in {
381+
defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
382+
defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
383+
defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
384+
defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
385+
} // End isCommutable = 1
383386
defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
384387

385388
defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
@@ -619,41 +622,36 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
619622
}
620623

621624
let SubtargetPredicate = isGFX9Plus in {
625+
let isCommutable = 1 in {
626+
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
627+
defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
628+
defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
629+
defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
630+
defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
631+
defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
632+
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
633+
defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
634+
defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
635+
defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
636+
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
637+
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
638+
defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
639+
defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
640+
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
641+
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
642+
defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
643+
defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
644+
} // End isCommutable = 1
645+
defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
646+
defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
622647
defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
623648
defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
624-
defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
625-
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
626649
defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
627-
defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
628-
defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
629-
630-
defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
631650

632-
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
633-
defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
634-
defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
635-
636-
defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
637-
defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
638-
defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
639-
640-
defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
641-
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
642-
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
643-
644-
defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
645-
defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
646-
647-
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
648-
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
649651

650652
defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
651653
defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
652654

653-
defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
654-
defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
655-
656-
657655
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
658656
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
659657
(ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
@@ -730,7 +728,9 @@ class PermlaneDiscardVDstIn<SDPatternOperator permlane,
730728

731729

732730
let SubtargetPredicate = isGFX10Plus in {
733-
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
731+
let isCommutable = 1 in {
732+
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
733+
} // End isCommutable = 1
734734
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
735735

736736
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {

llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
112112
;
113113
; GFX10-LABEL: add_shl_vgpr_const_inline_const:
114114
; GFX10: ; %bb.0:
115-
; GFX10-NEXT: v_add_lshl_u32 v0, v0, 0x3f4, 9
115+
; GFX10-NEXT: v_add_lshl_u32 v0, 0x3f4, v0, 9
116116
; GFX10-NEXT: ; return to shader part epilog
117117
%x = add i32 %a, 1012
118118
%result = shl i32 %x, 9

llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3555,7 +3555,7 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
35553555
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
35563556
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
35573557
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
3558-
; GFX10-NEXT: v_and_or_b32 v2, v3, 0xffff, v2
3558+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
35593559
; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
35603560
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
35613561
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1

llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3455,7 +3455,7 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
34553455
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
34563456
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
34573457
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
3458-
; GFX10-NEXT: v_and_or_b32 v2, v3, 0xffff, v2
3458+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
34593459
; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
34603460
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
34613461
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i16
469469
; GFX10-NEXT: s_mov_b32 s1, s3
470470
; GFX10-NEXT: s_mov_b32 s2, s4
471471
; GFX10-NEXT: s_mov_b32 s3, s5
472-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
472+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
473473
; GFX10-NEXT: s_mov_b32 s4, s6
474474
; GFX10-NEXT: s_mov_b32 s5, s7
475475
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -596,7 +596,7 @@ define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data,
596596
; GFX10-NEXT: s_mov_b32 s1, s3
597597
; GFX10-NEXT: s_mov_b32 s2, s4
598598
; GFX10-NEXT: s_mov_b32 s3, s5
599-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
599+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
600600
; GFX10-NEXT: s_mov_b32 s4, s6
601601
; GFX10-NEXT: s_mov_b32 s5, s7
602602
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -1243,7 +1243,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data
12431243
; GFX10-NEXT: s_mov_b32 s1, s3
12441244
; GFX10-NEXT: s_mov_b32 s2, s4
12451245
; GFX10-NEXT: s_mov_b32 s3, s5
1246-
; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
1246+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
12471247
; GFX10-NEXT: s_mov_b32 s4, s6
12481248
; GFX10-NEXT: s_mov_b32 s5, s7
12491249
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -1370,7 +1370,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64
13701370
; GFX10-NEXT: s_mov_b32 s1, s3
13711371
; GFX10-NEXT: s_mov_b32 s2, s4
13721372
; GFX10-NEXT: s_mov_b32 s3, s5
1373-
; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
1373+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
13741374
; GFX10-NEXT: s_mov_b32 s4, s6
13751375
; GFX10-NEXT: s_mov_b32 s5, s7
13761376
; GFX10-NEXT: s_mov_b32 s6, s8

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
4444
; GFX10NSA-NEXT: s_mov_b32 s9, s11
4545
; GFX10NSA-NEXT: s_mov_b32 s10, s12
4646
; GFX10NSA-NEXT: s_mov_b32 s11, s13
47-
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
47+
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
4848
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
4949
; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
5050
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -210,7 +210,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
210210
; GFX10NSA-NEXT: s_mov_b32 s9, s11
211211
; GFX10NSA-NEXT: s_mov_b32 s10, s12
212212
; GFX10NSA-NEXT: s_mov_b32 s11, s13
213-
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
213+
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
214214
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
215215
; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
216216
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -376,7 +376,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
376376
; GFX10NSA-NEXT: s_mov_b32 s9, s11
377377
; GFX10NSA-NEXT: s_mov_b32 s10, s12
378378
; GFX10NSA-NEXT: s_mov_b32 s11, s13
379-
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
379+
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
380380
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
381381
; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
382382
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -428,7 +428,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
428428
; GFX10NSA-NEXT: s_mov_b32 s9, s11
429429
; GFX10NSA-NEXT: s_mov_b32 s10, s12
430430
; GFX10NSA-NEXT: s_mov_b32 s11, s13
431-
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
431+
; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
432432
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
433433
; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
434434
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -683,7 +683,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
683683
; GFX10NSA-NEXT: s_mov_b32 s1, s3
684684
; GFX10NSA-NEXT: s_mov_b32 s2, s4
685685
; GFX10NSA-NEXT: s_mov_b32 s3, s5
686-
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
686+
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
687687
; GFX10NSA-NEXT: s_mov_b32 s4, s6
688688
; GFX10NSA-NEXT: s_mov_b32 s5, s7
689689
; GFX10NSA-NEXT: s_mov_b32 s6, s8
@@ -729,7 +729,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
729729
; GFX10NSA-NEXT: s_mov_b32 s1, s3
730730
; GFX10NSA-NEXT: s_mov_b32 s2, s4
731731
; GFX10NSA-NEXT: s_mov_b32 s3, s5
732-
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
732+
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
733733
; GFX10NSA-NEXT: s_mov_b32 s4, s6
734734
; GFX10NSA-NEXT: s_mov_b32 s5, s7
735735
; GFX10NSA-NEXT: s_mov_b32 s6, s8

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
7272
; GFX10-NEXT: s_mov_b32 s1, s3
7373
; GFX10-NEXT: s_mov_b32 s2, s4
7474
; GFX10-NEXT: s_mov_b32 s3, s5
75-
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
75+
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
7676
; GFX10-NEXT: s_mov_b32 s4, s6
7777
; GFX10-NEXT: s_mov_b32 s5, s7
7878
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -121,7 +121,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
121121
; GFX10-NEXT: s_mov_b32 s10, s12
122122
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
123123
; GFX10-NEXT: s_mov_b32 s1, s3
124-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, s12
124+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
125125
; GFX10-NEXT: s_mov_b32 s3, s5
126126
; GFX10-NEXT: s_mov_b32 s5, s7
127127
; GFX10-NEXT: s_mov_b32 s7, s9
@@ -164,7 +164,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
164164
; GFX10-NEXT: s_mov_b32 s1, s3
165165
; GFX10-NEXT: s_mov_b32 s2, s4
166166
; GFX10-NEXT: s_mov_b32 s3, s5
167-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
167+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
168168
; GFX10-NEXT: s_mov_b32 s4, s6
169169
; GFX10-NEXT: s_mov_b32 s5, s7
170170
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -213,7 +213,7 @@ define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
213213
; GFX10-NEXT: s_mov_b32 s10, s12
214214
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
215215
; GFX10-NEXT: s_mov_b32 s1, s3
216-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, s12
216+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
217217
; GFX10-NEXT: s_mov_b32 s3, s5
218218
; GFX10-NEXT: s_mov_b32 s5, s7
219219
; GFX10-NEXT: s_mov_b32 s7, s9
@@ -256,7 +256,7 @@ define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
256256
; GFX10-NEXT: s_mov_b32 s1, s3
257257
; GFX10-NEXT: s_mov_b32 s2, s4
258258
; GFX10-NEXT: s_mov_b32 s3, s5
259-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
259+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
260260
; GFX10-NEXT: s_mov_b32 s4, s6
261261
; GFX10-NEXT: s_mov_b32 s5, s7
262262
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -305,7 +305,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> i
305305
; GFX10-NEXT: s_mov_b32 s10, s12
306306
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
307307
; GFX10-NEXT: s_mov_b32 s1, s3
308-
; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, s12
308+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
309309
; GFX10-NEXT: s_mov_b32 s3, s5
310310
; GFX10-NEXT: s_mov_b32 s5, s7
311311
; GFX10-NEXT: s_mov_b32 s7, s9
@@ -348,7 +348,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> i
348348
; GFX10-NEXT: s_mov_b32 s1, s3
349349
; GFX10-NEXT: s_mov_b32 s2, s4
350350
; GFX10-NEXT: s_mov_b32 s3, s5
351-
; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
351+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
352352
; GFX10-NEXT: s_mov_b32 s4, s6
353353
; GFX10-NEXT: s_mov_b32 s5, s7
354354
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -394,7 +394,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
394394
; GFX10-NEXT: s_mov_b32 s1, s3
395395
; GFX10-NEXT: s_mov_b32 s2, s4
396396
; GFX10-NEXT: s_mov_b32 s3, s5
397-
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
397+
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
398398
; GFX10-NEXT: s_mov_b32 s4, s6
399399
; GFX10-NEXT: s_mov_b32 s5, s7
400400
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -440,7 +440,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
440440
; GFX10-NEXT: s_mov_b32 s1, s3
441441
; GFX10-NEXT: s_mov_b32 s2, s4
442442
; GFX10-NEXT: s_mov_b32 s3, s5
443-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
443+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
444444
; GFX10-NEXT: s_mov_b32 s4, s6
445445
; GFX10-NEXT: s_mov_b32 s5, s7
446446
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> in
486486
; GFX10-NEXT: s_mov_b32 s1, s3
487487
; GFX10-NEXT: s_mov_b32 s2, s4
488488
; GFX10-NEXT: s_mov_b32 s3, s5
489-
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
489+
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
490490
; GFX10-NEXT: s_mov_b32 s4, s6
491491
; GFX10-NEXT: s_mov_b32 s5, s7
492492
; GFX10-NEXT: s_mov_b32 s6, s8
@@ -532,7 +532,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
532532
; GFX10-NEXT: s_mov_b32 s1, s3
533533
; GFX10-NEXT: s_mov_b32 s2, s4
534534
; GFX10-NEXT: s_mov_b32 s3, s5
535-
; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
535+
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
536536
; GFX10-NEXT: s_mov_b32 s4, s6
537537
; GFX10-NEXT: s_mov_b32 s5, s7
538538
; GFX10-NEXT: s_mov_b32 s6, s8

0 commit comments

Comments
 (0)