[LoongArch] Enable FeatureExtLSX for generic-la64 processor #113421

Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-loongarch

Author: None (Ami-zhang)

Changes: This commit makes the `generic` target support FP and LSX, as discussed in #110211. Thereby, it allows 128-bit vector to be enabled by default in the loongarch64 backend.

Patch is 104.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113421.diff

23 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index ddb27dc6404fa8..67ee2fd791bcfb 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -129,7 +129,9 @@ include "LoongArchInstrInfo.td"
//===----------------------------------------------------------------------===//
def : ProcessorModel<"generic-la32", NoSchedModel, [Feature32Bit]>;
-def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit, FeatureUAL]>;
+def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit,
+ FeatureUAL,
+ FeatureExtLSX]>;
// Generic 64-bit processor with double-precision floating-point support.
def : ProcessorModel<"loongarch64", NoSchedModel, [Feature64Bit,
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
index 06dfe00d908475..5c9575b2baab18 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
@@ -123,13 +123,12 @@ define i64 @caller_large_scalars() nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -80
; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; CHECK-NEXT: st.d $zero, $sp, 24
-; CHECK-NEXT: st.d $zero, $sp, 16
-; CHECK-NEXT: st.d $zero, $sp, 8
+; CHECK-NEXT: vrepli.b $vr0, 0
+; CHECK-NEXT: vst $vr0, $sp, 8
; CHECK-NEXT: ori $a0, $zero, 2
; CHECK-NEXT: st.d $a0, $sp, 0
; CHECK-NEXT: st.d $zero, $sp, 56
-; CHECK-NEXT: st.d $zero, $sp, 48
-; CHECK-NEXT: st.d $zero, $sp, 40
+; CHECK-NEXT: vst $vr0, $sp, 40
; CHECK-NEXT: ori $a2, $zero, 1
; CHECK-NEXT: addi.d $a0, $sp, 32
; CHECK-NEXT: addi.d $a1, $sp, 0
@@ -182,14 +181,13 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
; CHECK-NEXT: ori $a0, $zero, 9
; CHECK-NEXT: st.d $a0, $sp, 0
; CHECK-NEXT: st.d $zero, $sp, 40
-; CHECK-NEXT: st.d $zero, $sp, 32
-; CHECK-NEXT: st.d $zero, $sp, 24
+; CHECK-NEXT: vrepli.b $vr0, 0
+; CHECK-NEXT: vst $vr0, $sp, 24
; CHECK-NEXT: ori $a0, $zero, 10
; CHECK-NEXT: st.d $a0, $sp, 16
; CHECK-NEXT: st.d $zero, $sp, 72
-; CHECK-NEXT: st.d $zero, $sp, 64
-; CHECK-NEXT: st.d $zero, $sp, 56
-; CHECK-NEXT: ori $t0, $zero, 8
+; CHECK-NEXT: ori $a0, $zero, 8
+; CHECK-NEXT: st.d $a0, $sp, 48
; CHECK-NEXT: ori $a0, $zero, 1
; CHECK-NEXT: ori $a1, $zero, 2
; CHECK-NEXT: ori $a2, $zero, 3
@@ -198,7 +196,7 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
; CHECK-NEXT: ori $a5, $zero, 6
; CHECK-NEXT: ori $a6, $zero, 7
; CHECK-NEXT: addi.d $a7, $sp, 48
-; CHECK-NEXT: st.d $t0, $sp, 48
+; CHECK-NEXT: vst $vr0, $sp, 56
; CHECK-NEXT: bl %plt(callee_large_scalars_exhausted_regs)
; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 96
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll
index 34fbec03c535b0..35186b660c1e66 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll
@@ -63,26 +63,17 @@ define i64 @caller_double_in_gpr_exhausted_fprs() nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1)
-; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI3_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2)
-; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI3_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3)
-; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI3_3)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4)
-; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI3_4)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5)
-; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI3_5)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6)
-; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI3_6)
-; CHECK-NEXT: addi.d $a0, $zero, 1
-; CHECK-NEXT: movgr2fr.d $fa0, $a0
-; CHECK-NEXT: ffint.d.l $fa0, $fa0
; CHECK-NEXT: ori $a0, $zero, 0
; CHECK-NEXT: lu32i.d $a0, 131072
; CHECK-NEXT: lu52i.d $a0, $a0, 1026
+; CHECK-NEXT: vldi $vr0, -912
+; CHECK-NEXT: vldi $vr1, -1024
+; CHECK-NEXT: vldi $vr2, -1016
+; CHECK-NEXT: vldi $vr3, -1008
+; CHECK-NEXT: vldi $vr4, -1004
+; CHECK-NEXT: vldi $vr5, -1000
+; CHECK-NEXT: vldi $vr6, -996
+; CHECK-NEXT: vldi $vr7, -992
; CHECK-NEXT: bl %plt(callee_double_in_gpr_exhausted_fprs)
; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -98,9 +89,7 @@ define i64 @caller_double_in_gpr_exhausted_fprs() nounwind {
define double @callee_double_ret() nounwind {
; CHECK-LABEL: callee_double_ret:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $a0, $zero, 1
-; CHECK-NEXT: movgr2fr.d $fa0, $a0
-; CHECK-NEXT: ffint.d.l $fa0, $fa0
+; CHECK-NEXT: vldi $vr0, -912
; CHECK-NEXT: ret
ret double 1.0
}
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll
index 558b9457239c13..79e66c8a6a1e38 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll
@@ -6,16 +6,10 @@
define i64 @callee_float_in_regs(i64 %a, float %b) nounwind {
; CHECK-LABEL: callee_float_in_regs:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
-; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
-; CHECK-NEXT: move $fp, $a0
-; CHECK-NEXT: move $a0, $a1
-; CHECK-NEXT: bl %plt(__fixsfdi)
-; CHECK-NEXT: add.d $a0, $fp, $a0
-; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: movgr2fr.w $fa0, $a1
+; CHECK-NEXT: ftintrz.l.s $fa0, $fa0
+; CHECK-NEXT: movfr2gr.d $a1, $fa0
+; CHECK-NEXT: add.d $a0, $a0, $a1
; CHECK-NEXT: ret
%b_fptosi = fptosi float %b to i64
%1 = add i64 %a, %b_fptosi
@@ -27,7 +21,8 @@ define i64 @caller_float_in_regs() nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT: lu12i.w $a1, 262144
+; CHECK-NEXT: vldi $vr0, -1280
+; CHECK-NEXT: movfr2gr.s $a1, $fa0
; CHECK-NEXT: ori $a0, $zero, 1
; CHECK-NEXT: bl %plt(callee_float_in_regs)
; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
@@ -59,7 +54,7 @@ define i64 @caller_float_on_stack() nounwind {
; CHECK-NEXT: ori $a2, $zero, 2
; CHECK-NEXT: ori $a4, $zero, 3
; CHECK-NEXT: ori $a6, $zero, 4
-; CHECK-NEXT: st.d $a1, $sp, 0
+; CHECK-NEXT: st.w $a1, $sp, 0
; CHECK-NEXT: move $a1, $zero
; CHECK-NEXT: move $a3, $zero
; CHECK-NEXT: move $a5, $zero
@@ -75,7 +70,8 @@ define i64 @caller_float_on_stack() nounwind {
define float @callee_tiny_scalar_ret() nounwind {
; CHECK-LABEL: callee_tiny_scalar_ret:
; CHECK: # %bb.0:
-; CHECK-NEXT: lu12i.w $a0, 260096
+; CHECK-NEXT: vldi $vr0, -1168
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
; CHECK-NEXT: ret
ret float 1.0
}
@@ -86,7 +82,8 @@ define i64 @caller_tiny_scalar_ret() nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; CHECK-NEXT: bl %plt(callee_tiny_scalar_ret)
-; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index a26102710cbebe..161ed573c81f02 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -175,16 +175,11 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
;
; LA64-LABEL: test_ctpop_i8:
; LA64: # %bb.0:
-; LA64-NEXT: srli.d $a1, $a0, 1
-; LA64-NEXT: andi $a1, $a1, 85
-; LA64-NEXT: sub.d $a0, $a0, $a1
-; LA64-NEXT: andi $a1, $a0, 51
-; LA64-NEXT: srli.d $a0, $a0, 2
-; LA64-NEXT: andi $a0, $a0, 51
-; LA64-NEXT: add.d $a0, $a1, $a0
-; LA64-NEXT: srli.d $a1, $a0, 4
-; LA64-NEXT: add.d $a0, $a0, $a1
-; LA64-NEXT: andi $a0, $a0, 15
+; LA64-NEXT: andi $a0, $a0, 255
+; LA64-NEXT: vldi $vr0, 0
+; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: vpcnt.d $vr0, $vr0
+; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0
; LA64-NEXT: ret
%1 = call i8 @llvm.ctpop.i8(i8 %a)
ret i8 %1
@@ -213,22 +208,11 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
;
; LA64-LABEL: test_ctpop_i16:
; LA64: # %bb.0:
-; LA64-NEXT: srli.d $a1, $a0, 1
-; LA64-NEXT: lu12i.w $a2, 5
-; LA64-NEXT: ori $a2, $a2, 1365
-; LA64-NEXT: and $a1, $a1, $a2
-; LA64-NEXT: sub.d $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 3
-; LA64-NEXT: ori $a1, $a1, 819
-; LA64-NEXT: and $a2, $a0, $a1
-; LA64-NEXT: srli.d $a0, $a0, 2
-; LA64-NEXT: and $a0, $a0, $a1
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: srli.d $a1, $a0, 4
-; LA64-NEXT: add.d $a0, $a0, $a1
-; LA64-NEXT: bstrpick.d $a1, $a0, 11, 8
-; LA64-NEXT: andi $a0, $a0, 15
-; LA64-NEXT: add.d $a0, $a0, $a1
+; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
+; LA64-NEXT: vldi $vr0, 0
+; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: vpcnt.d $vr0, $vr0
+; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0
; LA64-NEXT: ret
%1 = call i16 @llvm.ctpop.i16(i16 %a)
ret i16 %1
@@ -261,26 +245,11 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
;
; LA64-LABEL: test_ctpop_i32:
; LA64: # %bb.0:
-; LA64-NEXT: srli.d $a1, $a0, 1
-; LA64-NEXT: lu12i.w $a2, 349525
-; LA64-NEXT: ori $a2, $a2, 1365
-; LA64-NEXT: and $a1, $a1, $a2
-; LA64-NEXT: sub.d $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 209715
-; LA64-NEXT: ori $a1, $a1, 819
-; LA64-NEXT: and $a2, $a0, $a1
-; LA64-NEXT: srli.d $a0, $a0, 2
-; LA64-NEXT: and $a0, $a0, $a1
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: srli.d $a1, $a0, 4
-; LA64-NEXT: add.d $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 61680
-; LA64-NEXT: ori $a1, $a1, 3855
-; LA64-NEXT: and $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 4112
-; LA64-NEXT: ori $a1, $a1, 257
-; LA64-NEXT: mul.d $a0, $a0, $a1
-; LA64-NEXT: bstrpick.d $a0, $a0, 31, 24
+; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0
+; LA64-NEXT: vldi $vr0, 0
+; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: vpcnt.d $vr0, $vr0
+; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0
; LA64-NEXT: ret
%1 = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %1
@@ -327,30 +296,10 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
;
; LA64-LABEL: test_ctpop_i64:
; LA64: # %bb.0:
-; LA64-NEXT: srli.d $a1, $a0, 1
-; LA64-NEXT: lu12i.w $a2, 349525
-; LA64-NEXT: ori $a2, $a2, 1365
-; LA64-NEXT: bstrins.d $a2, $a2, 62, 32
-; LA64-NEXT: and $a1, $a1, $a2
-; LA64-NEXT: sub.d $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 209715
-; LA64-NEXT: ori $a1, $a1, 819
-; LA64-NEXT: bstrins.d $a1, $a1, 61, 32
-; LA64-NEXT: and $a2, $a0, $a1
-; LA64-NEXT: srli.d $a0, $a0, 2
-; LA64-NEXT: and $a0, $a0, $a1
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: srli.d $a1, $a0, 4
-; LA64-NEXT: add.d $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 61680
-; LA64-NEXT: ori $a1, $a1, 3855
-; LA64-NEXT: bstrins.d $a1, $a1, 59, 32
-; LA64-NEXT: and $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 4112
-; LA64-NEXT: ori $a1, $a1, 257
-; LA64-NEXT: bstrins.d $a1, $a1, 56, 32
-; LA64-NEXT: mul.d $a0, $a0, $a1
-; LA64-NEXT: srli.d $a0, $a0, 56
+; LA64-NEXT: vldi $vr0, 0
+; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: vpcnt.d $vr0, $vr0
+; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0
; LA64-NEXT: ret
%1 = call i64 @llvm.ctpop.i64(i64 %a)
ret i64 %1
diff --git a/llvm/test/CodeGen/LoongArch/double-imm.ll b/llvm/test/CodeGen/LoongArch/double-imm.ll
index 8d50b27907d72b..fe403ec532d8e8 100644
--- a/llvm/test/CodeGen/LoongArch/double-imm.ll
+++ b/llvm/test/CodeGen/LoongArch/double-imm.ll
@@ -59,9 +59,7 @@ define double @f64_add_fimm1(double %a) nounwind {
;
; LA64-LABEL: f64_add_fimm1:
; LA64: # %bb.0:
-; LA64-NEXT: addi.d $a0, $zero, 1
-; LA64-NEXT: movgr2fr.d $fa1, $a0
-; LA64-NEXT: ffint.d.l $fa1, $fa1
+; LA64-NEXT: vldi $vr1, -912
; LA64-NEXT: fadd.d $fa0, $fa0, $fa1
; LA64-NEXT: ret
%1 = fadd double %a, 1.0
@@ -79,9 +77,7 @@ define double @f64_positive_fimm1() nounwind {
;
; LA64-LABEL: f64_positive_fimm1:
; LA64: # %bb.0:
-; LA64-NEXT: addi.d $a0, $zero, 1
-; LA64-NEXT: movgr2fr.d $fa0, $a0
-; LA64-NEXT: ffint.d.l $fa0, $fa0
+; LA64-NEXT: vldi $vr0, -912
; LA64-NEXT: ret
ret double 1.0
}
diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll
index ac5cb3c7e72115..cf15fd8bdb4372 100644
--- a/llvm/test/CodeGen/LoongArch/frame.ll
+++ b/llvm/test/CodeGen/LoongArch/frame.ll
@@ -12,8 +12,8 @@ define i32 @test() nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -32
; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
; CHECK-NEXT: st.w $zero, $sp, 16
-; CHECK-NEXT: st.d $zero, $sp, 8
-; CHECK-NEXT: st.d $zero, $sp, 0
+; CHECK-NEXT: vrepli.b $vr0, 0
+; CHECK-NEXT: vst $vr0, $sp, 0
; CHECK-NEXT: addi.d $a0, $sp, 4
; CHECK-NEXT: bl %plt(test1)
; CHECK-NEXT: move $a0, $zero
diff --git a/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll b/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll
index 6cf9d7d75b9963..3d6e22b5eeb102 100644
--- a/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll
+++ b/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll
@@ -5,22 +5,9 @@
define void @getSetCCResultType(ptr %p) {
; CHECK-LABEL: getSetCCResultType:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: ld.w $a1, $a0, 0
-; CHECK-NEXT: ld.w $a2, $a0, 12
-; CHECK-NEXT: ld.w $a3, $a0, 4
-; CHECK-NEXT: ld.w $a4, $a0, 8
-; CHECK-NEXT: sltui $a1, $a1, 1
-; CHECK-NEXT: sub.d $a1, $zero, $a1
-; CHECK-NEXT: sltui $a3, $a3, 1
-; CHECK-NEXT: sub.d $a3, $zero, $a3
-; CHECK-NEXT: sltui $a4, $a4, 1
-; CHECK-NEXT: sub.d $a4, $zero, $a4
-; CHECK-NEXT: sltui $a2, $a2, 1
-; CHECK-NEXT: sub.d $a2, $zero, $a2
-; CHECK-NEXT: st.w $a2, $a0, 12
-; CHECK-NEXT: st.w $a4, $a0, 8
-; CHECK-NEXT: st.w $a3, $a0, 4
-; CHECK-NEXT: st.w $a1, $a0, 0
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vseqi.w $vr0, $vr0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%0 = load <4 x i32>, ptr %p, align 16
diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll
index 570fd438be97bf..83f796f73934c9 100644
--- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll
+++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll
@@ -1,4 +1,4 @@
-; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s
+; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LA32
; RUN: not llc --mtriple=loongarch64 < %s 2>&1 | FileCheck %s
define void @constraint_l() {
@@ -32,9 +32,9 @@ define void @constraint_K() {
}
define void @constraint_f() nounwind {
-; CHECK: error: couldn't allocate input reg for constraint 'f'
+; LA32: error: couldn't allocate input reg for constraint 'f'
tail call void asm "fadd.s $$fa0, $$fa0, $0", "f"(float 0.0)
-; CHECK: error: couldn't allocate input reg for constraint 'f'
+; LA32: error: couldn't allocate input reg for constraint 'f'
tail call void asm "fadd.s $$fa0, $$fa0, $0", "f"(double 0.0)
ret void
}
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-error.ll b/llvm/test/CodeGen/LoongArch/intrinsic-error.ll
index a839ab149c3338..176e3f60c56252 100644
--- a/llvm/test/CodeGen/LoongArch/intrinsic-error.ll
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-error.ll
@@ -1,4 +1,4 @@
-; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s
+; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LA32
; RUN: not llc --mtriple=loongarch64 < %s 2>&1 | FileCheck %s
declare void @llvm.loongarch.dbar(i32)
@@ -54,7 +54,7 @@ entry:
}
define void @movgr2fcsr(i32 %a) nounwind {
-; CHECK: llvm.loongarch.movgr2fcsr: requires basic 'f' target feature.
+; LA32: llvm.loongarch.movgr2fcsr: requires basic 'f' target feature.
entry:
call void @llvm.loongarch.movgr2fcsr(i32 1, i32 %a)
ret void
@@ -75,7 +75,7 @@ entry:
}
define i32 @movfcsr2gr() nounwind {
-; CHECK: llvm.loongarch.movfcsr2gr: requires basic 'f' target feature.
+; LA32: llvm.loongarch.movfcsr2gr: requires basic 'f' target feature.
entry:
%res = call i32 @llvm.loongarch.movfcsr2gr(i32 1)
ret i32 %res
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
index 622001db329551..402ddb9ad941b4 100644
--- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
@@ -12,18 +12,12 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der
; CHECK-NEXT: alsl.d $a1, $a1, $a2, 4
; CHECK-NEXT: addi.d $a2, $sp, 0
; CHECK-NEXT: add.d $a3, $a2, $a1
-; CHECK-NEXT: ldx.d $a1, $a1, $a2
-; CHECK-NEXT: ld.d $a2, $a3, 40
-; CHECK-NEXT: st.d $a1, $a0, 0
-; CHECK-NEXT: st.d $a2, $a0, 40
-; CHECK-NEXT: ld.d $a1, $a3, 32
-; CHECK-NEXT: ld.d $a2, $a3, 24
-; CHECK-NEXT: ld.d $a4, $a3, 16
-; CHECK-NEXT: ld.d $a3, $a3, 8
-; CHECK-NEXT: st.d $a1, $a0, 32
-; CHECK-NEXT: st.d $a2, $a0, 24
-; CHECK-NEXT: st.d $a4, $a0, 16
-; CHECK-NEXT: st.d $a3, $a0, 8
+; CHECK-NEXT: vldx $vr0, $a1, $a2
+; CHECK-NEXT: vld $vr1, $a3, 32
+; CHECK-NEXT: vld $vr2, $a3, 16
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: vst $vr1, $a0, 32
+; CHECK-NEXT: vst $vr2, $a0, 16
; CHECK-NEXT: addi.d $sp, $sp, 96
; CHECK-NEXT: ret
%1 = alloca [2 x %Box], align 16
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
index 7e320d9245f1c2..6ea658acdd7172 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
@@ -40,9 +40,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
; LA64D-LABEL: float_fadd_acquire:
; LA64D: # %bb.0:
; LA64D-NEXT: fld.s $fa0, $a0, 0
-; LA64D-NEXT: addi.w $a1, $zero, 1
-; LA64D-NEXT: movgr2fr.w $fa1, $a1
-; LA64D-NEXT: ffint.s.w $fa1, $fa1
+; LA64D-NEXT: vldi $vr1, -1168
; LA64D-NEXT: .p2align 4, , 16
; LA64D-NEXT: .LBB0_1: # %atomicrmw.start
; LA64D-NEXT: # =>This Loop Header: Depth=1
@@ -111,8 +109,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
; LA64D-LABEL: float_fsub_acquire:
; LA64D: # %bb.0:
; LA64D-NEXT: fld.s $fa0, $a0, 0
-; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0)
-; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI1_0)
+; LA64D-NEXT: vldi $vr1, -1040
; LA64D-NEXT: .p2align 4, , 16
; LA64D-NEXT: .LBB1_1: # %atomicrmw.start
; LA64D-NEXT: # =>This Loop Header: Depth=1
@@ -183,9 +180,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
; LA64D-LABEL: float_fmin_acquire:
; LA64D: # %bb.0:
; LA64D-NEXT: fld.s $fa0, $a0, 0
-; LA64D-NEXT: addi.w $a1, $zero, 1
-; LA64D-NEXT: movgr2fr.w $fa1, $a1
-; LA64D-NEXT: ffint.s.w $fa1, $fa1
+; LA64D-NEXT: vldi $vr1, -1168
; LA64D-NEXT: .p2align 4, , 16
; LA64D-NEXT: .LBB2_1: # %atomicrmw.start
; LA64D-NEXT: # =>This Loop Header: Depth=1
@@ -257,9 +252,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
; LA64D-LABEL: float_fmax_acquire:
; LA64D: # %bb.0:
; LA64D-NEXT: fld.s $fa0, $a0, 0
-; LA64D-NEXT: addi.w $a1, $zero, 1
-; LA64D-NEXT: movgr2fr.w $fa1, $a1
-; LA64D-NEXT: ffint.s.w $fa1, $fa1
+; LA64D-NEXT: vldi $vr1, -1168
; LA64D-NEXT: .p2align 4, , 16
; LA64D-NEXT: .LBB3_1: # %atomicrmw.start
; LA64D-NEXT: # =>This Loo...
[truncated]
cc @xen0n
It's nice to see vectorization being leveraged across the board, as per the recommended baseline!

Note to myself: while this clearly affects `-march=generic` for Clang, I haven't verified whether `-march=loongarch64` behavior stays the same as GCC (no LSX).

2nd note to myself: GCC needs `-march=generic` implemented too for feature parity.
-; CHECK-NEXT: addi.d $sp, $sp, -16
-; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
-; CHECK-NEXT: move $fp, $a0
-; CHECK-NEXT: move $a0, $a1
-; CHECK-NEXT: bl %plt(__fixsfdi)
-; CHECK-NEXT: add.d $a0, $fp, $a0
-; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: movgr2fr.w $fa0, $a1
+; CHECK-NEXT: ftintrz.l.s $fa0, $fa0
+; CHECK-NEXT: movfr2gr.d $a1, $fa0
+; CHECK-NEXT: add.d $a0, $a0, $a1
The LP64S tests require updating of compilation flags so they stay soft-float.
Updated. Added `--mattr=-f` to stay soft-float.
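For reference, a minimal sketch of what such a RUN line could look like; the exact flag spelling beyond `--mattr=-f` is an assumption, since the updated test isn't quoted here:

```llvm
; Hypothetical RUN line for the lp64s tests: with generic-la64 now implying
; LSX (which depends on FP), explicitly disabling 'f' keeps the test
; soft-float regardless of the new CPU defaults.
; RUN: llc --mtriple=loongarch64 --target-abi=lp64s --mattr=-f < %s \
; RUN:   | FileCheck %s
```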
; RUN: llc --mtriple=loongarch64 --verify-machineinstrs --stop-after=prologepilog < %s | FileCheck %s

;; Check that STATEPOINT instruction has an early clobber implicit def for R1.

define void @test() gc "statepoint-example" {
entry:
%safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" ()]
; CHECK: STATEPOINT 0, 0, 0, target-flags(loongarch-call-plt) @return_i1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, csr_ilp32s_lp64s, implicit-def $r3, implicit-def dead early-clobber $r1
Is this behavioral change intentional?
The automated updating of this case with `update_llc_test_checks.py` led to this behavior. To keep the original style for easier reviewing, I manually updated it this time.
@@ -2,7 +2,7 @@
 ; RUN: opt -passes=loop-idiom -mtriple=loongarch32 -mattr=+lsx -S < %s | FileCheck %s --check-prefix=CPOP
 ; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -mattr=+lsx -S < %s | FileCheck %s --check-prefix=CPOP
 ; RUN: opt -passes=loop-idiom -mtriple=loongarch32 -S < %s | FileCheck %s --check-prefix=NOCPOP
-; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -S < %s | FileCheck %s --check-prefix=NOCPOP
+; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -S < %s | FileCheck %s --check-prefix=CPOP
Could we additionally test LA64 without LSX somehow, to ensure that coverage doesn't shrink?
Updated; I adjusted the testing approach.
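One possible shape for that, as a sketch (the prefix names mirror the existing ones; the exact final RUN lines are an assumption):

```llvm
; Hypothetical RUN lines keeping LA64-without-LSX coverage: the default
; loongarch64 target now implies LSX (CPOP), while -lsx removes it again
; so the scalar path (NOCPOP) is still exercised.
; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -S < %s \
; RUN:   | FileCheck %s --check-prefix=CPOP
; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -mattr=-lsx -S < %s \
; RUN:   | FileCheck %s --check-prefix=NOCPOP
```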
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
The checking prefixes for this test were written manually. I prefer keeping this style unchanged this time for easier reviewing. Then maybe you can submit a separate NFC PR switching to use `update_llc_test_checks.py`.
Done.
@@ -1,13 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
Same as llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
Done.
Thanks for your thoughtfulness and for sharing your insights! Of course, we will also continue to take note of matters related to …

Sorry, I misremembered things and there's actually no …
This patch enables LSX by default in Clang, whereas GCC does not. Should we align the behavior with GCC?
Actually it has been enabled in #100056. Yes, Clang and GCC have different mechanisms for enabling LSX by default: Clang achieves this through how its source code is written, while GCC achieves it through how it is configured when being built. AFAIK, the upcoming soft-dev-conv requires that developers enable LSX by default when building desktop and server operating systems (but not for embedded systems). Maybe maintainers of Linux distributions will turn on LSX when they configure GCC.
> AFAIK, the upcoming soft-dev-conv requires that developers enable LSX by default when building desktop and server operating systems (but not for embedded systems).
Upcoming? Indeed it seems so because v0.1 says "should".
This matters mostly from a business perspective, more so than the technical (performance) perspective, because one of the indirect consequences of mandating LSX (without also mandating scalar fallbacks be provided for all code) is that a body of closed-source LSX-containing components will begin to form, forcing the ecosystem building upon it to require LSX as well. This allows for easy market segmentation by e.g. providing specific CPU models without LSX/LASX -- while being lower-priced overall, they cannot run most general-purpose distributions unmodified, and even if the devs went to the great length of assembling a sysroot themselves e.g. with buildroot, they'd be still out of luck if their business depends on a closed-source blob requiring LSX. And with certain LSX/LASX features likely patent-encumbered, this makes third-party LoongArch implementations less competitive as well.
However, the status quo is already LSX all over the place -- `memcpy`/`memset` getting auto-vectorized once it's enabled by default -- as one can verify on Compiler Explorer or on their own machine. In this case having consistency is probably better, and we should approach the embedded/indie-core problem separately.
As I have now verified several important `-march` choices to match GCC behavior for a trivial loop eligible for vectorization (simple i32 adds):

- no `-march` given: LSX
- `-march=loongarch64`: scalar
- `-march=la64v1.0`: LSX
I'm accepting the patch as well. Thanks!
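Rendered as an llc-level sketch of the same spot-check (clang's `-march` values map onto LLVM CPU/feature sets; the RUN lines and CHECK patterns below are my assumptions, not part of the patch):

```llvm
; Hypothetical spot-check: by default (generic-la64) the vector add stays
; in LSX registers; with LSX disabled it must be scalarized.
; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LSX
; RUN: llc --mtriple=loongarch64 --mattr=-lsx < %s | FileCheck %s --check-prefix=NOLSX

define void @add_v4i32(ptr %p, ptr %q) {
  %a = load <4 x i32>, ptr %p
  %b = load <4 x i32>, ptr %q
  %s = add <4 x i32> %a, %b
  store <4 x i32> %s, ptr %p
  ret void
}
; LSX: vadd.w
; NOLSX-NOT: vadd.w
```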
This commit makes the `generic` target support FP and LSX, as discussed in #110211. Thereby, it allows 128-bit vector to be enabled by default in the loongarch64 backend.
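As a rough illustration of what the new default buys (a sketch; the RUN line and CHECK patterns are my assumptions, though the instruction pattern matches the frame.ll diff above), zeroing 16 bytes can now use one 128-bit store:

```llvm
; Hypothetical example: with FeatureExtLSX in generic-la64, a 16-byte zero
; store lowers to vrepli.b + vst instead of two scalar st.d instructions.
; RUN: llc --mtriple=loongarch64 --mcpu=generic-la64 < %s | FileCheck %s

define void @zero16(ptr %p) {
; CHECK: vrepli.b $vr0, 0
; CHECK: vst $vr0, $a0, 0
  store <16 x i8> zeroinitializer, ptr %p
  ret void
}
```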