
Commit 84066df

Author: Mohammed Agabaria
[LV][X86] update the cost of interleaving mem. access of floats
Recommit: this patch updates the costs of interleaved loads of v8f32 with strides 3 and 8, and fixes the location of the lit test so that it runs with make check-all.

Differential Revision: https://reviews.llvm.org/D39403

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317471 91177308-0d34-0410-b5e6-96231b3b80d8
Parent: 194c54b

2 files changed: 145 additions, 1 deletion

lib/Target/X86/X86TargetTransformInfo.cpp

4 additions, 1 deletion

@@ -2644,12 +2644,15 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
     { 3, MVT::v8i8, 9 },   //(load 24i8 and) deinterleave into 3 x 8i8
     { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
     { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
+    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
 
     { 4, MVT::v2i8, 12 },  //(load 8i8 and) deinterleave into 4 x 2i8
     { 4, MVT::v4i8, 4 },   //(load 16i8 and) deinterleave into 4 x 4i8
     { 4, MVT::v8i8, 20 },  //(load 32i8 and) deinterleave into 4 x 8i8
     { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
-    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
+    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+    { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
   };
 
   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
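How these entries feed the reported costs, as a minimal sketch rather than the actual LLVM implementation (the real logic lives in X86TTIImpl::getInterleavedMemoryOpCostAVX2): the interleave group is split into legal 256-bit loads, each load is costed on its own, and the table row adds the deinterleave-shuffle overhead. The struct and helper names below are hypothetical.

#include <cstdio>

// Hypothetical, simplified mirror of a CostTblEntry row for interleaved
// loads: {interleave factor, vector type name, deinterleave shuffle cost}.
struct InterleaveCostRow {
  unsigned Factor;
  const char *VT;
  unsigned ShuffleCost;
};

// The two rows added by this commit (AVX2 interleaved-load table).
static const InterleaveCostRow NewRows[] = {
    {3, "v8f32", 17}, // stride 3: 3 x 8f32 = 96 bytes -> 3 x 32-byte loads
    {8, "v8f32", 40}, // stride 8: 8 x 8f32 = 256 bytes -> 8 x 32-byte loads
};

// Sketch of the overall formula: number of legal 32-byte memory ops times
// the per-load cost (1 for an aligned 256-bit load on Skylake), plus the
// table's shuffle cost.
static unsigned interleavedLoadCost(const InterleaveCostRow &Row,
                                    unsigned MemOpCost = 1) {
  unsigned NumMemOps = Row.Factor; // one 8 x f32 load per group member
  return NumMemOps * MemOpCost + Row.ShuffleCost;
}

int main() {
  // Reproduces the values the new lit test checks:
  // stride 3 -> 3 * 1 + 17 = 20, stride 8 -> 8 * 1 + 40 = 48.
  for (const InterleaveCostRow &Row : NewRows)
    std::printf("stride %u %s: estimated cost %u\n", Row.Factor, Row.VT,
                interleavedLoadCost(Row));
  return 0;
}

These totals, 20 for the stride-3 group and 48 for the stride-8 group, are exactly what the FileCheck lines in the new test below expect for VF 8.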
New lit test file (added by this commit)

141 additions, 0 deletions

@@ -0,0 +1,141 @@
; REQUIRES: asserts
; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"

@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4

; Function Attrs: norecurse nounwind
define void @stride8(float %k, i32 %width_) {
entry:

; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float

  %cmp72 = icmp sgt i32 %width_, 0
  br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body: ; preds = %for.body.lr.ph, %for.body
  %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
  %0 = load float, float* %arrayidx, align 4
  %mul = fmul fast float %0, %k
  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
  %1 = load float, float* %arrayidx2, align 4
  %add3 = fadd fast float %1, %mul
  store float %add3, float* %arrayidx2, align 4
  %add4 = or i32 %i.073, 1
  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
  %2 = load float, float* %arrayidx5, align 4
  %mul6 = fmul fast float %2, %k
  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
  %3 = load float, float* %arrayidx8, align 4
  %add9 = fadd fast float %3, %mul6
  store float %add9, float* %arrayidx8, align 4
  %add10 = or i32 %i.073, 2
  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
  %4 = load float, float* %arrayidx11, align 4
  %mul12 = fmul fast float %4, %k
  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
  %5 = load float, float* %arrayidx14, align 4
  %add15 = fadd fast float %5, %mul12
  store float %add15, float* %arrayidx14, align 4
  %add16 = or i32 %i.073, 3
  %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
  %6 = load float, float* %arrayidx17, align 4
  %mul18 = fmul fast float %6, %k
  %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
  %7 = load float, float* %arrayidx20, align 4
  %add21 = fadd fast float %7, %mul18
  store float %add21, float* %arrayidx20, align 4
  %add22 = or i32 %i.073, 4
  %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
  %8 = load float, float* %arrayidx23, align 4
  %mul24 = fmul fast float %8, %k
  %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
  %9 = load float, float* %arrayidx26, align 4
  %add27 = fadd fast float %9, %mul24
  store float %add27, float* %arrayidx26, align 4
  %add28 = or i32 %i.073, 5
  %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
  %10 = load float, float* %arrayidx29, align 4
  %mul30 = fmul fast float %10, %k
  %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
  %11 = load float, float* %arrayidx32, align 4
  %add33 = fadd fast float %11, %mul30
  store float %add33, float* %arrayidx32, align 4
  %add34 = or i32 %i.073, 6
  %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
  %12 = load float, float* %arrayidx35, align 4
  %mul36 = fmul fast float %12, %k
  %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
  %13 = load float, float* %arrayidx38, align 4
  %add39 = fadd fast float %13, %mul36
  store float %add39, float* %arrayidx38, align 4
  %add40 = or i32 %i.073, 7
  %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
  %14 = load float, float* %arrayidx41, align 4
  %mul42 = fmul fast float %14, %k
  %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
  %15 = load float, float* %arrayidx44, align 4
  %add45 = fadd fast float %15, %mul42
  store float %add45, float* %arrayidx44, align 4
  %add46 = add nuw nsw i32 %i.073, 8
  %cmp = icmp slt i32 %add46, %width_
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: norecurse nounwind
define void @stride3(float %k, i32 %width_) {
entry:

; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float

  %cmp27 = icmp sgt i32 %width_, 0
  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
  br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  ret void

for.body: ; preds = %for.body.lr.ph, %for.body
  %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
  %0 = load float, float* %arrayidx, align 4
  %mul = fmul fast float %0, %k
  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
  %1 = load float, float* %arrayidx2, align 4
  %add3 = fadd fast float %1, %mul
  store float %add3, float* %arrayidx2, align 4
  %add4 = add nuw nsw i32 %i.028, 1
  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
  %2 = load float, float* %arrayidx5, align 4
  %mul6 = fmul fast float %2, %k
  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
  %3 = load float, float* %arrayidx8, align 4
  %add9 = fadd fast float %3, %mul6
  store float %add9, float* %arrayidx8, align 4
  %add10 = add nuw nsw i32 %i.028, 2
  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
  %4 = load float, float* %arrayidx11, align 4
  %mul12 = fmul fast float %4, %k
  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
  %5 = load float, float* %arrayidx14, align 4
  %add15 = fadd fast float %5, %mul12
  store float %add15, float* %arrayidx14, align 4
  %add16 = add nuw nsw i32 %i.028, 3
  %cmp = icmp slt i32 %add16, %width_
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
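For context, the stride3 function above corresponds to a manually unrolled-by-3 accumulation loop over src and dst; the C++ loop below is an illustrative reconstruction (not taken from the original sources), and stride8 follows the same shape with eight elements per iteration.

// Illustrative reconstruction of the stride3 pattern: three consecutive
// elements are touched per iteration, so the loop vectorizer treats the
// float loads as an interleave group with factor 3.
float src[120];
float dst[120];

void stride3(float k, int width) {
  for (int i = 0; i < width; i += 3) {
    dst[i]     += src[i]     * k;
    dst[i + 1] += src[i + 1] * k;
    dst[i + 2] += src[i + 2] * k;
  }
}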
