Skip to content

Commit 17570a9

Browse files
mundaym authored and laboger committed
cmd/compile: emit fused multiply-{add,subtract} on ppc64x
A follow on to CL 36963 adding support for ppc64x.

Performance changes (as posted on the issue):

poly1305:
benchmark                   old ns/op     new ns/op     delta
Benchmark64-16              172           151           -12.21%
Benchmark1K-16              1828          1523          -16.68%
Benchmark64Unaligned-16     172           151           -12.21%
Benchmark1KUnaligned-16     1827          1523          -16.64%

math:
BenchmarkAcos-16            43.9          39.9          -9.11%
BenchmarkAcosh-16           57.0          45.8          -19.65%
BenchmarkAsin-16            35.8          33.0          -7.82%
BenchmarkAsinh-16           68.6          60.8          -11.37%
BenchmarkAtan-16            19.8          16.2          -18.18%
BenchmarkAtanh-16           65.5          57.5          -12.21%
BenchmarkAtan2-16           45.4          34.2          -24.67%
BenchmarkGamma-16           37.6          26.0          -30.85%
BenchmarkLgamma-16          40.0          28.2          -29.50%
BenchmarkLog1p-16           35.1          29.1          -17.09%
BenchmarkSin-16             22.7          18.4          -18.94%
BenchmarkSincos-16          31.7          23.7          -25.24%
BenchmarkSinh-16            146           131           -10.27%
BenchmarkY0-16              130           107           -17.69%
BenchmarkY1-16              127           107           -15.75%
BenchmarkYn-16              278           235           -15.47%

Updates #17895.

Change-Id: I1c16199715d20c9c4bd97c4a950bcfa69eb688c1
Reviewed-on: https://go-review.googlesource.com/38095
Reviewed-by: Carlos Eduardo Seo <[email protected]>
Reviewed-by: Lynn Boger <[email protected]>
1 parent 01ac5b8 commit 17570a9

File tree

8 files changed

+302
-9
lines changed

8 files changed

+302
-9
lines changed

src/cmd/compile/internal/gc/asm_test.go

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,11 @@ var allAsmTests = []*asmTests{
196196
imports: []string{"math/bits"},
197197
tests: linuxMIPSTests,
198198
},
199+
{
200+
arch: "ppc64le",
201+
os: "linux",
202+
tests: linuxPPC64LETests,
203+
},
199204
}
200205

201206
var linuxAMD64Tests = []*asmTest{
@@ -1329,6 +1334,42 @@ var linuxMIPSTests = []*asmTest{
13291334
},
13301335
}
13311336

1337+
var linuxPPC64LETests = []*asmTest{
1338+
// Fused multiply-add/sub instructions.
1339+
{
1340+
`
1341+
func f0(x, y, z float64) float64 {
1342+
return x * y + z
1343+
}
1344+
`,
1345+
[]string{"\tFMADD\t"},
1346+
},
1347+
{
1348+
`
1349+
func f1(x, y, z float64) float64 {
1350+
return x * y - z
1351+
}
1352+
`,
1353+
[]string{"\tFMSUB\t"},
1354+
},
1355+
{
1356+
`
1357+
func f2(x, y, z float32) float32 {
1358+
return x * y + z
1359+
}
1360+
`,
1361+
[]string{"\tFMADDS\t"},
1362+
},
1363+
{
1364+
`
1365+
func f3(x, y, z float32) float32 {
1366+
return x * y - z
1367+
}
1368+
`,
1369+
[]string{"\tFMSUBS\t"},
1370+
},
1371+
}
1372+
13321373
// TestLineNumber checks to make sure the generated assembly has line numbers
13331374
// see issue #16214
13341375
func TestLineNumber(t *testing.T) {

src/cmd/compile/internal/ppc64/prog.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,10 @@ var progtable = [ppc64.ALAST & obj.AMask]gc.ProgInfo{
8181
ppc64.AFMULS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
8282
ppc64.AFDIV & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
8383
ppc64.AFDIVS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
84+
ppc64.AFMADD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
85+
ppc64.AFMADDS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
86+
ppc64.AFMSUB & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
87+
ppc64.AFMSUBS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
8488
ppc64.AFCTIDZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
8589
ppc64.AFCTIWZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
8690
ppc64.AFCFID & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},

src/cmd/compile/internal/ppc64/ssa.go

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -458,6 +458,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
458458
// Closure pointer is R11 (already)
459459
gc.CheckLoweredGetClosurePtr(v)
460460

461+
case ssa.OpPPC64LoweredRound32F, ssa.OpPPC64LoweredRound64F:
462+
// input is already rounded
463+
461464
case ssa.OpLoadReg:
462465
loadOp := loadByType(v.Type)
463466
p := gc.Prog(loadOp)
@@ -565,6 +568,22 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
565568
p.To.Type = obj.TYPE_REG
566569
p.To.Reg = r
567570

571+
case ssa.OpPPC64FMADD, ssa.OpPPC64FMADDS, ssa.OpPPC64FMSUB, ssa.OpPPC64FMSUBS:
572+
r := v.Reg()
573+
r1 := v.Args[0].Reg()
574+
r2 := v.Args[1].Reg()
575+
r3 := v.Args[2].Reg()
576+
// r = r1*r2 ± r3
577+
p := gc.Prog(v.Op.Asm())
578+
p.From.Type = obj.TYPE_REG
579+
p.From.Reg = r1
580+
p.Reg = r3
581+
p.From3 = new(obj.Addr)
582+
p.From3.Type = obj.TYPE_REG
583+
p.From3.Reg = r2
584+
p.To.Type = obj.TYPE_REG
585+
p.To.Reg = r
586+
568587
case ssa.OpPPC64MaskIfNotCarry:
569588
r := v.Reg()
570589
p := gc.Prog(v.Op.Asm())

src/cmd/compile/internal/ssa/gen/PPC64.rules

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,8 +70,8 @@
7070
(Cvt32Fto64F x) -> x // Note x will have the wrong type for patterns dependent on Float32/Float64
7171
(Cvt64Fto32F x) -> (FRSP x)
7272

73-
(Round32F x) -> x
74-
(Round64F x) -> x
73+
(Round32F x) -> (LoweredRound32F x)
74+
(Round64F x) -> (LoweredRound64F x)
7575

7676
(Sqrt x) -> (FSQRT x)
7777

@@ -849,3 +849,11 @@
849849
// A particular pattern seen in cgo code:
850850
(AND (MOVDconst [c]) x:(MOVBZload _ _)) -> (ANDconst [c&0xFF] x)
851851
(AND x:(MOVBZload _ _) (MOVDconst [c])) -> (ANDconst [c&0xFF] x)
852+
853+
// floating-point fused multiply-add/sub
854+
(FADD z (FMUL x y)) -> (FMADD x y z)
855+
(FADD (FMUL x y) z) -> (FMADD x y z)
856+
(FSUB (FMUL x y) z) -> (FMSUB x y z)
857+
(FADDS z (FMULS x y)) -> (FMADDS x y z)
858+
(FADDS (FMULS x y) z) -> (FMADDS x y z)
859+
(FSUBS (FMULS x y) z) -> (FMSUBS x y z)

src/cmd/compile/internal/ssa/gen/PPC64Ops.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,7 @@ func init() {
147147
fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
148148
gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
149149
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
150+
fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
150151
fp2cr = regInfo{inputs: []regMask{fp, fp}}
151152
fpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{fp}}
152153
fpstore = regInfo{inputs: []regMask{gp | sp | sb, fp}}
@@ -172,6 +173,11 @@ func init() {
172173
{name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true}, // arg0*arg1
173174
{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1
174175

176+
{name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD"}, // arg0*arg1 + arg2
177+
{name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // arg0*arg1 + arg2
178+
{name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB"}, // arg0*arg1 - arg2
179+
{name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // arg0*arg1 - arg2
180+
175181
{name: "SRAD", argLength: 2, reg: gp21, asm: "SRAD"}, // arg0 >>a arg1, 64 bits (all sign if arg1 & 64 != 0)
176182
{name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >>a arg1, 32 bits (all sign if arg1 & 32 != 0)
177183
{name: "SRD", argLength: 2, reg: gp21, asm: "SRD"}, // arg0 >> arg1, 64 bits (0 if arg1 & 64 != 0)
@@ -293,6 +299,9 @@ func init() {
293299

294300
//arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
295301
{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
302+
// Round ops to block fused-multiply-add extraction.
303+
{name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true},
304+
{name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true},
296305

297306
// Convert pointer to integer, takes a memory operand for ordering.
298307
{name: "MOVDconvert", argLength: 2, reg: gp11, asm: "MOVD"},

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 92 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1263,6 +1263,10 @@ const (
12631263
OpPPC64MULHWU
12641264
OpPPC64FMUL
12651265
OpPPC64FMULS
1266+
OpPPC64FMADD
1267+
OpPPC64FMADDS
1268+
OpPPC64FMSUB
1269+
OpPPC64FMSUBS
12661270
OpPPC64SRAD
12671271
OpPPC64SRAW
12681272
OpPPC64SRD
@@ -1353,6 +1357,8 @@ const (
13531357
OpPPC64FGreaterEqual
13541358
OpPPC64LoweredGetClosurePtr
13551359
OpPPC64LoweredNilCheck
1360+
OpPPC64LoweredRound32F
1361+
OpPPC64LoweredRound64F
13561362
OpPPC64MOVDconvert
13571363
OpPPC64CALLstatic
13581364
OpPPC64CALLclosure
@@ -16059,6 +16065,66 @@ var opcodeTable = [...]opInfo{
1605916065
},
1606016066
},
1606116067
},
16068+
{
16069+
name: "FMADD",
16070+
argLen: 3,
16071+
asm: ppc64.AFMADD,
16072+
reg: regInfo{
16073+
inputs: []inputInfo{
16074+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16075+
{1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16076+
{2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16077+
},
16078+
outputs: []outputInfo{
16079+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16080+
},
16081+
},
16082+
},
16083+
{
16084+
name: "FMADDS",
16085+
argLen: 3,
16086+
asm: ppc64.AFMADDS,
16087+
reg: regInfo{
16088+
inputs: []inputInfo{
16089+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16090+
{1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16091+
{2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16092+
},
16093+
outputs: []outputInfo{
16094+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16095+
},
16096+
},
16097+
},
16098+
{
16099+
name: "FMSUB",
16100+
argLen: 3,
16101+
asm: ppc64.AFMSUB,
16102+
reg: regInfo{
16103+
inputs: []inputInfo{
16104+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16105+
{1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16106+
{2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16107+
},
16108+
outputs: []outputInfo{
16109+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16110+
},
16111+
},
16112+
},
16113+
{
16114+
name: "FMSUBS",
16115+
argLen: 3,
16116+
asm: ppc64.AFMSUBS,
16117+
reg: regInfo{
16118+
inputs: []inputInfo{
16119+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16120+
{1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16121+
{2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16122+
},
16123+
outputs: []outputInfo{
16124+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
16125+
},
16126+
},
16127+
},
1606216128
{
1606316129
name: "SRAD",
1606416130
argLen: 2,
@@ -17222,6 +17288,32 @@ var opcodeTable = [...]opInfo{
1722217288
clobbers: 2147483648, // R31
1722317289
},
1722417290
},
17291+
{
17292+
name: "LoweredRound32F",
17293+
argLen: 1,
17294+
resultInArg0: true,
17295+
reg: regInfo{
17296+
inputs: []inputInfo{
17297+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
17298+
},
17299+
outputs: []outputInfo{
17300+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
17301+
},
17302+
},
17303+
},
17304+
{
17305+
name: "LoweredRound64F",
17306+
argLen: 1,
17307+
resultInArg0: true,
17308+
reg: regInfo{
17309+
inputs: []inputInfo{
17310+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
17311+
},
17312+
outputs: []outputInfo{
17313+
{0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
17314+
},
17315+
},
17316+
},
1722517317
{
1722617318
name: "MOVDconvert",
1722717319
argLen: 2,

0 commit comments

Comments
 (0)