Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 7230d55

Browse files
committed
Merging r275981 and r276740:
------------------------------------------------------------------------ r275981 | rksimon | 2016-07-19 08:07:43 -0700 (Tue, 19 Jul 2016) | 13 lines [X86][SSE] Reimplement SSE fp2si conversion intrinsics instead of using generic IR. D20859 and D20860 attempted to replace the SSE (V)CVTTPS2DQ and VCVTTPD2DQ truncating conversions with generic IR instead. It turns out that the behaviour of these intrinsics is different enough from generic IR that this will cause problems: INF/NaN/out-of-range values are guaranteed to result in a 0x80000000 value, which plays havoc with constant folding, which converts them to either zero or UNDEF. This is also an issue with the scalar implementations (which were already generic IR and what I was trying to match). This patch changes both scalar and packed versions back to using x86-specific builtins. It also deals with the other scalar conversion cases that depend on the runtime rounding mode and can have similar issues with constant folding. A companion clang patch is at D22105. Differential Revision: https://reviews.llvm.org/D22106 ------------------------------------------------------------------------ ------------------------------------------------------------------------ r276740 | rksimon | 2016-07-26 03:41:28 -0700 (Tue, 26 Jul 2016) | 5 lines [X86][SSE] Fixed issue with memory folding of (v)cvtsd2ss intrinsics. Fixed typo in the intrinsic definitions of (v)cvtsd2ss with memory folding. This was only unearthed when rL276102 started using the intrinsic again. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@276990 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent cd00acd commit 7230d55

14 files changed

+196
-97
lines changed

include/llvm/IR/IntrinsicsX86.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
479479
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
480480
def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
481481
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
482+
def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
483+
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
482484
def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
483485
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
484486
def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
@@ -1512,8 +1514,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
15121514
Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
15131515
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
15141516
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
1517+
def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
1518+
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
15151519
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
15161520
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
1521+
def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
1522+
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
15171523
}
15181524

15191525
// Vector bit test

lib/Analysis/ConstantFolding.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V,
14241424
/// integer type Ty is used to select how many bits are available for the
14251425
/// result. Returns null if the conversion cannot be performed, otherwise
14261426
/// returns the Constant value resulting from the conversion.
1427-
Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
1428-
Type *Ty) {
1427+
Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
1428+
Type *Ty) {
14291429
// All of these conversion intrinsics form an integer of at most 64bits.
14301430
unsigned ResultWidth = Ty->getIntegerBitWidth();
14311431
assert(ResultWidth <= 64 &&
@@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
14381438
APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
14391439
/*isSigned=*/true, mode,
14401440
&isExact);
1441-
if (status != APFloat::opOK && status != APFloat::opInexact)
1441+
if (status != APFloat::opOK &&
1442+
(!roundTowardZero || status != APFloat::opInexact))
14421443
return nullptr;
14431444
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
14441445
}
@@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
16761677
case Intrinsic::x86_sse2_cvtsd2si:
16771678
case Intrinsic::x86_sse2_cvtsd2si64:
16781679
if (ConstantFP *FPOp =
1679-
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
1680-
return ConstantFoldConvertToInt(FPOp->getValueAPF(),
1681-
/*roundTowardZero=*/false, Ty);
1680+
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
1681+
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
1682+
/*roundTowardZero=*/false, Ty);
16821683
case Intrinsic::x86_sse_cvttss2si:
16831684
case Intrinsic::x86_sse_cvttss2si64:
16841685
case Intrinsic::x86_sse2_cvttsd2si:
16851686
case Intrinsic::x86_sse2_cvttsd2si64:
16861687
if (ConstantFP *FPOp =
1687-
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
1688-
return ConstantFoldConvertToInt(FPOp->getValueAPF(),
1689-
/*roundTowardZero=*/true, Ty);
1688+
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
1689+
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
1690+
/*roundTowardZero=*/true, Ty);
16901691
}
16911692
}
16921693

lib/IR/AutoUpgrade.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
251251
Name == "sse2.cvtps2pd" ||
252252
Name == "avx.cvtdq2.pd.256" ||
253253
Name == "avx.cvt.ps2.pd.256" ||
254-
Name == "sse2.cvttps2dq" ||
255-
Name.startswith("avx.cvtt.") ||
256254
Name.startswith("avx.vinsertf128.") ||
257255
Name == "avx2.vinserti128" ||
258256
Name.startswith("avx.vextractf128.") ||
@@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
712710
Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
713711
else
714712
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
715-
} else if (IsX86 && (Name == "sse2.cvttps2dq" ||
716-
Name.startswith("avx.cvtt."))) {
717-
// Truncation (round to zero) float/double to i32 vector conversion.
718-
Value *Src = CI->getArgOperand(0);
719-
VectorType *DstTy = cast<VectorType>(CI->getType());
720-
Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
721713
} else if (IsX86 && Name.startswith("sse4a.movnt.")) {
722714
Module *M = F->getParent();
723715
SmallVector<Metadata *, 1> Elts;

lib/Target/X86/X86InstrSSE.td

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
18201820
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
18211821
IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
18221822
Sched<[WriteCvtF2F]>;
1823-
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
1823+
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
18241824
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
18251825
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
18261826
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
18361836
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
18371837
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
18381838
Sched<[WriteCvtF2F]>;
1839-
def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
1839+
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
18401840
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
18411841
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
18421842
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
20092009
// SSE2 packed instructions with XS prefix
20102010
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
20112011
"cvttps2dq\t{$src, $dst|$dst, $src}",
2012-
[], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
2012+
[(set VR128:$dst,
2013+
(int_x86_sse2_cvttps2dq VR128:$src))],
2014+
IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
20132015
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
20142016
"cvttps2dq\t{$src, $dst|$dst, $src}",
2015-
[], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
2017+
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
2018+
(loadv4f32 addr:$src)))],
2019+
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
20162020
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
20172021
"cvttps2dq\t{$src, $dst|$dst, $src}",
2018-
[], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2022+
[(set VR256:$dst,
2023+
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
2024+
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
20192025
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
20202026
"cvttps2dq\t{$src, $dst|$dst, $src}",
2021-
[], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
2027+
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
2028+
(loadv8f32 addr:$src)))],
2029+
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
20222030
Sched<[WriteCvtF2ILd]>;
20232031

20242032
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
20252033
"cvttps2dq\t{$src, $dst|$dst, $src}",
2026-
[], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
2034+
[(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
2035+
IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
20272036
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
20282037
"cvttps2dq\t{$src, $dst|$dst, $src}",
2029-
[], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2038+
[(set VR128:$dst,
2039+
(int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
2040+
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
20302041

20312042
let Predicates = [HasAVX] in {
20322043
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
20962107
// YMM only
20972108
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
20982109
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
2099-
[], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2110+
[(set VR128:$dst,
2111+
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
2112+
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
21002113
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
21012114
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
2102-
[], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2115+
[(set VR128:$dst,
2116+
(int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
2117+
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
21032118
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
21042119
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
21052120

test/CodeGen/X86/avx-intrinsics-fast-isel.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -681,10 +681,11 @@ define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
681681
; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
682682
; X64-NEXT: vzeroupper
683683
; X64-NEXT: retq
684-
%cvt = fptosi <4 x double> %a0 to <4 x i32>
684+
%cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
685685
%res = bitcast <4 x i32> %cvt to <2 x i64>
686686
ret <2 x i64> %res
687687
}
688+
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
688689

689690
define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
690691
; X32-LABEL: test_mm256_cvttps_epi32:
@@ -696,10 +697,11 @@ define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
696697
; X64: # BB#0:
697698
; X64-NEXT: vcvttps2dq %ymm0, %ymm0
698699
; X64-NEXT: retq
699-
%cvt = fptosi <8 x float> %a0 to <8 x i32>
700+
%cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
700701
%res = bitcast <8 x i32> %cvt to <4 x i64>
701702
ret <4 x i64> %res
702703
}
704+
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
703705

704706
define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
705707
; X32-LABEL: test_mm256_div_pd:

test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -359,35 +359,12 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
359359
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
360360

361361

362-
define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
363-
; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
364-
; CHECK: ## BB#0:
365-
; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
366-
; CHECK-NEXT: vzeroupper
367-
; CHECK-NEXT: retl
368-
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
369-
ret <4 x i32> %res
370-
}
371-
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
372-
373-
374-
define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
375-
; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
376-
; CHECK: ## BB#0:
377-
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
378-
; CHECK-NEXT: retl
379-
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
380-
ret <8 x i32> %res
381-
}
382-
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
383-
384-
385362
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
386363
; add operation forces the execution domain.
387364
; CHECK-LABEL: test_x86_sse2_storeu_dq:
388365
; CHECK: ## BB#0:
389366
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
390-
; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
367+
; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
391368
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
392369
; CHECK-NEXT: retl
393370
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>

test/CodeGen/X86/avx-intrinsics-x86.ll

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3431,6 +3431,39 @@ define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
34313431
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
34323432

34333433

3434+
define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
3435+
; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
3436+
; AVX: ## BB#0:
3437+
; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
3438+
; AVX-NEXT: vzeroupper
3439+
; AVX-NEXT: retl
3440+
;
3441+
; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
3442+
; AVX512VL: ## BB#0:
3443+
; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0
3444+
; AVX512VL-NEXT: retl
3445+
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
3446+
ret <4 x i32> %res
3447+
}
3448+
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
3449+
3450+
3451+
define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
3452+
; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
3453+
; AVX: ## BB#0:
3454+
; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
3455+
; AVX-NEXT: retl
3456+
;
3457+
; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
3458+
; AVX512VL: ## BB#0:
3459+
; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
3460+
; AVX512VL-NEXT: retl
3461+
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
3462+
ret <8 x i32> %res
3463+
}
3464+
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
3465+
3466+
34343467
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
34353468
; AVX-LABEL: test_x86_avx_dp_ps_256:
34363469
; AVX: ## BB#0:
@@ -4552,15 +4585,15 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
45524585
; AVX-LABEL: movnt_dq:
45534586
; AVX: ## BB#0:
45544587
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
4555-
; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
4588+
; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
45564589
; AVX-NEXT: vmovntdq %ymm0, (%eax)
45574590
; AVX-NEXT: vzeroupper
45584591
; AVX-NEXT: retl
45594592
;
45604593
; AVX512VL-LABEL: movnt_dq:
45614594
; AVX512VL: ## BB#0:
45624595
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
4563-
; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
4596+
; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
45644597
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
45654598
; AVX512VL-NEXT: retl
45664599
%a2 = add <2 x i64> %a1, <i64 1, i64 1>

test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,12 @@
66
define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
77
; X64-LABEL: test_mm_cvtsi64_ss:
88
; X64: # BB#0:
9-
; X64-NEXT: cvtsi2ssq %rdi, %xmm1
10-
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
9+
; X64-NEXT: cvtsi2ssq %rdi, %xmm0
1110
; X64-NEXT: retq
12-
%cvt = sitofp i64 %a1 to float
13-
%res = insertelement <4 x float> %a0, float %cvt, i32 0
11+
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
1412
ret <4 x float> %res
1513
}
14+
declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
1615

1716
define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
1817
; X64-LABEL: test_mm_cvtss_si64:
@@ -29,7 +28,7 @@ define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
2928
; X64: # BB#0:
3029
; X64-NEXT: cvttss2si %xmm0, %rax
3130
; X64-NEXT: retq
32-
%cvt = extractelement <4 x float> %a0, i32 0
33-
%res = fptosi float %cvt to i64
31+
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
3432
ret i64 %res
3533
}
34+
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone

test/CodeGen/X86/sse-intrinsics-fast-isel.ll

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -707,20 +707,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
707707
define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
708708
; X32-LABEL: test_mm_cvtsi32_ss:
709709
; X32: # BB#0:
710-
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
711-
; X32-NEXT: cvtsi2ssl %eax, %xmm1
712-
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
710+
; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
713711
; X32-NEXT: retl
714712
;
715713
; X64-LABEL: test_mm_cvtsi32_ss:
716714
; X64: # BB#0:
717-
; X64-NEXT: cvtsi2ssl %edi, %xmm1
718-
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
715+
; X64-NEXT: cvtsi2ssl %edi, %xmm0
719716
; X64-NEXT: retq
720-
%cvt = sitofp i32 %a1 to float
721-
%res = insertelement <4 x float> %a0, float %cvt, i32 0
717+
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
722718
ret <4 x float> %res
723719
}
720+
declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
724721

725722
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
726723
; X32-LABEL: test_mm_cvtss_f32:
@@ -762,10 +759,10 @@ define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
762759
; X64: # BB#0:
763760
; X64-NEXT: cvttss2si %xmm0, %eax
764761
; X64-NEXT: retq
765-
%cvt = extractelement <4 x float> %a0, i32 0
766-
%res = fptosi float %cvt to i32
762+
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
767763
ret i32 %res
768764
}
765+
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
769766

770767
define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
771768
; X32-LABEL: test_mm_cvttss_si32:
@@ -777,8 +774,7 @@ define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
777774
; X64: # BB#0:
778775
; X64-NEXT: cvttss2si %xmm0, %eax
779776
; X64-NEXT: retq
780-
%cvt = extractelement <4 x float> %a0, i32 0
781-
%res = fptosi float %cvt to i32
777+
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
782778
ret i32 %res
783779
}
784780

test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,12 @@ define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
2525
define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
2626
; X64-LABEL: test_mm_cvtsi64_sd:
2727
; X64: # BB#0:
28-
; X64-NEXT: cvtsi2sdq %rdi, %xmm1
29-
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
28+
; X64-NEXT: cvtsi2sdq %rdi, %xmm0
3029
; X64-NEXT: retq
31-
%cvt = sitofp i64 %a1 to double
32-
%res = insertelement <2 x double> %a0, double %cvt, i32 0
30+
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
3331
ret <2 x double> %res
3432
}
33+
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
3534

3635
define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
3736
; X64-LABEL: test_mm_cvtsi64_si128:
@@ -48,10 +47,10 @@ define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
4847
; X64: # BB#0:
4948
; X64-NEXT: cvttsd2si %xmm0, %rax
5049
; X64-NEXT: retq
51-
%ext = extractelement <2 x double> %a0, i32 0
52-
%res = fptosi double %ext to i64
50+
%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
5351
ret i64 %res
5452
}
53+
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
5554

5655
define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
5756
; X64-LABEL: test_mm_loadu_si64:

0 commit comments

Comments
 (0)