Skip to content

Commit 2fec790

Browse files
committed
cmd/compile: boost inlining into FORs
As Than McIntosh already mentioned, it's common practice to boost inlining into FORs, since the callsite could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained while inlnode walks the function and contains information about the ancestor FORs of the current node in inlnode. The forContext contains a liveCounter which shows for how many more nodes this FOR is an ancestor. Current constants are the following: A "big" FOR is a FOR which contains >=inlineBigForNodes(50) nodes or >=inlineBigForCallNodes(5) inlinable call nodes. In such FORs no boost is applied. Other FORs are considered to be small and boost callsites with an extra budget equal to inlineExtraForBudget(20). Updates #17566 The following results were obtained on GO1, while binary size did not increase significantly: 10441232 -> 10465920, which is less than 0.3%. goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.15s ± 1% ~ (p=0.589 n=6+6) Fannkuch11-8 2.70s ± 0% 2.70s ± 0% -0.08% (p=0.002 n=6+6) FmtFprintfEmpty-8 31.9ns ± 0% 31.9ns ± 3% ~ (p=0.907 n=6+6) FmtFprintfString-8 57.0ns ± 0% 57.6ns ± 0% +1.19% (p=0.004 n=5+6) FmtFprintfInt-8 65.2ns ± 0% 64.1ns ± 0% -1.57% (p=0.002 n=6+6) FmtFprintfIntInt-8 103ns ± 0% 103ns ± 0% ~ (p=0.079 n=5+4) FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.37% (p=0.008 n=5+5) FmtFprintfFloat-8 169ns ± 0% 173ns ± 0% +2.55% (p=0.004 n=5+6) FmtManyArgs-8 450ns ± 1% 450ns ± 0% ~ (p=1.000 n=6+6) GobDecode-8 4.38ms ± 1% 4.35ms ± 1% ~ (p=0.132 n=6+6) GobEncode-8 3.07ms ± 0% 3.06ms ± 0% -0.38% (p=0.009 n=6+6) Gzip-8 195ms ± 0% 195ms ± 0% ~ (p=0.095 n=5+5) Gunzip-8 28.2ms ± 0% 28.4ms ± 0% +0.57% (p=0.004 n=6+6) HTTPClientServer-8 45.1µs ± 1% 45.3µs ± 1% ~ (p=0.082 n=5+6) JSONEncode-8 7.98ms ± 1% 7.94ms ± 0% -0.47% (p=0.015 n=6+6) JSONDecode-8 35.4ms ± 1% 35.1ms ± 0% -1.04% (p=0.002 
n=6+6) Mandelbrot200-8 4.50ms ± 0% 4.50ms ± 0% ~ (p=0.699 n=6+6) GoParse-8 2.98ms ± 0% 2.99ms ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy0_32-8 55.5ns ± 1% 52.8ns ± 2% -4.94% (p=0.002 n=6+6) RegexpMatchEasy0_1K-8 178ns ± 0% 162ns ± 1% -9.18% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 50.1ns ± 0% 48.4ns ± 2% -3.34% (p=0.002 n=6+6) RegexpMatchEasy1_1K-8 272ns ± 2% 268ns ± 1% ~ (p=0.065 n=6+6) RegexpMatchMedium_32-8 907ns ± 5% 897ns ± 7% ~ (p=0.660 n=6+6) RegexpMatchMedium_1K-8 26.5µs ± 0% 26.6µs ± 0% +0.41% (p=0.008 n=5+5) RegexpMatchHard_32-8 1.28µs ± 0% 1.29µs ± 1% ~ (p=0.167 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.6µs ± 0% ~ (p=0.126 n=6+5) Revcomp-8 398ms ± 0% 395ms ± 0% -0.64% (p=0.010 n=6+4) Template-8 48.4ms ± 0% 47.8ms ± 0% -1.30% (p=0.008 n=5+5) TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.108 n=6+6) TimeFormat-8 294ns ± 0% 259ns ± 0% -11.86% (p=0.000 n=5+6) [Geo mean] 40.4µs 40.0µs -1.11% name old speed new speed delta GobDecode-8 175MB/s ± 1% 176MB/s ± 1% ~ (p=0.132 n=6+6) GobEncode-8 250MB/s ± 0% 251MB/s ± 0% +0.38% (p=0.009 n=6+6) Gzip-8 99.3MB/s ± 0% 99.4MB/s ± 0% ~ (p=0.095 n=5+5) Gunzip-8 687MB/s ± 0% 683MB/s ± 0% -0.57% (p=0.004 n=6+6) JSONEncode-8 243MB/s ± 1% 244MB/s ± 0% +0.47% (p=0.015 n=6+6) JSONDecode-8 54.8MB/s ± 1% 55.3MB/s ± 0% +1.04% (p=0.002 n=6+6) GoParse-8 19.4MB/s ± 0% 19.4MB/s ± 1% ~ (p=0.103 n=5+5) RegexpMatchEasy0_32-8 576MB/s ± 1% 606MB/s ± 2% +5.21% (p=0.002 n=6+6) RegexpMatchEasy0_1K-8 5.75GB/s ± 0% 6.33GB/s ± 1% +10.10% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 661MB/s ± 2% +3.47% (p=0.002 n=6+6) RegexpMatchEasy1_1K-8 3.76GB/s ± 2% 3.82GB/s ± 1% ~ (p=0.065 n=6+6) RegexpMatchMedium_32-8 35.4MB/s ± 5% 35.7MB/s ± 7% ~ (p=0.615 n=6+6) RegexpMatchMedium_1K-8 38.6MB/s ± 0% 38.4MB/s ± 0% -0.40% (p=0.008 n=5+5) RegexpMatchHard_32-8 25.0MB/s ± 0% 24.8MB/s ± 1% ~ (p=0.167 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.6MB/s ± 0% ~ (p=0.238 n=5+5) Revcomp-8 639MB/s ± 0% 643MB/s ± 0% +0.65% (p=0.010 n=6+4) Template-8 40.1MB/s ± 0% 40.6MB/s ± 0% 
+1.32% (p=0.008 n=5+5) [Geo mean] 176MB/s 178MB/s +1.38%
1 parent ab7c904 commit 2fec790

File tree

3 files changed

+199
-14
lines changed

3 files changed

+199
-14
lines changed

src/cmd/compile/internal/inline/inl.go

+120-14
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,102 @@ const (
5151

5252
inlineBigFunctionNodes = 5000 // Functions with this many nodes are considered "big".
5353
inlineBigFunctionMaxCost = 20 // Max cost of inlinee when inlining into a "big" function.
54+
55+
inlineBigForNodes = 50 // FORs with this many nodes are considered "big" and functions are not forced to be inlined.
56+
inlineBigForCallNodes = 5 // FORs with this many call nodes are considered "big" and functions are not forced to be inlined.
57+
inlineExtraForBudget = 20 // Extra budget to inline into not a "big" FOR.
58+
59+
// The upper budget for a visitor. It accounts for the maximum cost with which a function could be inlined.
60+
inlineVisitorBudget = inlineMaxBudget + inlineExtraForBudget
5461
)
5562

63+
// forContext describes one FOR loop that is currently an ancestor of the
// node being processed by inlnode. Field order matters: updateForsStack
// historically built this struct positionally and -m=2 prints it with %v.
type forContext struct {
	// liveCounter is the number of nodes still to be visited inside this
	// FOR. updateForsStack decrements it on every visited node and drops
	// the entry once it goes negative; fixupForStackAfterInline raises it
	// when an inlined body adds nodes under the loop.
	liveCounter int
	// totalNodes is the node count of the FOR (the FOR node itself
	// excluded) recorded when the loop was pushed; it is compared against
	// inlineBigForNodes.
	totalNodes int
	// callNodes is the number of inlinable call nodes under the FOR,
	// including those brought in by inlining; it is compared against
	// inlineBigForCallNodes.
	callNodes int
}
68+
69+
type inlContext struct {
70+
inlMap map[*ir.Func]bool
71+
forsStack []forContext
72+
}
73+
74+
// isinlinable checks if the function can be inlined in a 'typical' scenario
75+
// when no boosts are applied.
76+
func isinlinable(fn *ir.Func) bool {
77+
return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget
78+
}
79+
80+
// countNodes returns count of child nodes and inlinable child call nodes.
81+
func countInlinableCallNodes(n ir.Node) (int, int) {
82+
child_nodes := 0
83+
child_inlinable_call_nodes := 0
84+
ir.Any(n, func(n ir.Node) bool {
85+
child_nodes++
86+
switch n.Op() {
87+
case ir.OCALLFUNC:
88+
call := n.(*ir.CallExpr)
89+
if call.NoInline {
90+
break
91+
}
92+
if ir.IsIntrinsicCall(call) {
93+
break
94+
}
95+
if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
96+
child_inlinable_call_nodes++
97+
}
98+
}
99+
return false
100+
})
101+
return child_nodes, child_inlinable_call_nodes
102+
}
103+
104+
// updateForsStack maintains forsStack, which is used to recognise
105+
// which call nodes are located inside fors, while doing inlnode.
106+
func updateForsStack(inlCtx *inlContext, n ir.Node) {
107+
outdated := 0
108+
for i := len(inlCtx.forsStack) - 1; i >= 0; i-- {
109+
inlCtx.forsStack[i].liveCounter--
110+
if inlCtx.forsStack[i].liveCounter < 0 {
111+
outdated++
112+
}
113+
}
114+
inlCtx.forsStack = inlCtx.forsStack[:len(inlCtx.forsStack)-outdated]
115+
116+
// If we are in a "big" FOR, it's useless to calculate node count
117+
// for this FOR, since no function will be inlined.
118+
if n.Op() == ir.OFOR && (len(inlCtx.forsStack) == 0 || ancestorForsAreSmall(inlCtx)) {
119+
child_nodes, child_inlinable_call_nodes := countInlinableCallNodes(n)
120+
inlCtx.forsStack = append(inlCtx.forsStack, forContext{child_nodes - 1, child_nodes - 1, child_inlinable_call_nodes})
121+
122+
if base.Flag.LowerM > 1 {
123+
fmt.Printf("%v: add for to stack %v\n", ir.Line(n), inlCtx.forsStack)
124+
}
125+
}
126+
}
127+
128+
// fixupForStackAfterInline fixes forsStack after a call node was replaced with inlined node.
129+
func fixupForStackAfterInline(inlCtx *inlContext, n ir.Node, call *ir.InlinedCallExpr) {
130+
if len(inlCtx.forsStack) == 0 {
131+
return
132+
}
133+
134+
child_nodes, child_inlinable_call_nodes := countInlinableCallNodes(call)
135+
136+
for i := 0; i < len(inlCtx.forsStack); i++ {
137+
inlCtx.forsStack[i].liveCounter += child_nodes - 1
138+
inlCtx.forsStack[i].callNodes += child_inlinable_call_nodes
139+
}
140+
141+
if base.Flag.LowerM > 1 {
142+
fmt.Printf("%v: fixup inline %v\n", ir.Line(n), inlCtx.forsStack)
143+
}
144+
}
145+
146+
func ancestorForsAreSmall(inlCtx *inlContext) bool {
147+
return len(inlCtx.forsStack) > 0 && inlCtx.forsStack[0].totalNodes < inlineBigForNodes && inlCtx.forsStack[0].callNodes < inlineBigForCallNodes
148+
}
149+
56150
// InlinePackage finds functions that can be inlined and clones them before walk expands them.
57151
func InlinePackage() {
58152
ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -167,7 +261,7 @@ func CanInline(fn *ir.Func) {
167261
// list. See issue 25249 for more context.
168262

169263
visitor := hairyVisitor{
170-
budget: inlineMaxBudget,
264+
budget: inlineVisitorBudget,
171265
extraCallCost: cc,
172266
}
173267
if visitor.tooHairy(fn) {
@@ -176,20 +270,24 @@ func CanInline(fn *ir.Func) {
176270
}
177271

178272
n.Func.Inl = &ir.Inline{
179-
Cost: inlineMaxBudget - visitor.budget,
273+
Cost: inlineVisitorBudget - visitor.budget,
180274
Dcl: pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
181275
Body: inlcopylist(fn.Body),
182276

183277
CanDelayResults: canDelayResults(fn),
184278
}
185279

186280
if base.Flag.LowerM > 1 {
187-
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
188-
} else if base.Flag.LowerM != 0 {
281+
if isinlinable(n.Func) {
282+
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
283+
} else {
284+
fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
285+
}
286+
} else if base.Flag.LowerM != 0 && isinlinable(n.Func) {
189287
fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
190288
}
191289
if logopt.Enabled() {
192-
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
290+
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
193291
}
194292
}
195293

@@ -241,7 +339,7 @@ func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
241339
return true
242340
}
243341
if v.budget < 0 {
244-
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
342+
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-v.budget, inlineVisitorBudget)
245343
return true
246344
}
247345
return false
@@ -503,10 +601,11 @@ func InlineCalls(fn *ir.Func) {
503601
// but allow inlining if there is a recursion cycle of many functions.
504602
// Most likely, the inlining will stop before we even hit the beginning of
505603
// the cycle again, but the map catches the unusual case.
506-
inlMap := make(map[*ir.Func]bool)
604+
inlCtx := inlContext{make(map[*ir.Func]bool), make([]forContext, 0)}
605+
507606
var edit func(ir.Node) ir.Node
508607
edit = func(n ir.Node) ir.Node {
509-
return inlnode(n, maxCost, inlMap, edit)
608+
return inlnode(n, maxCost, &inlCtx, edit)
510609
}
511610
ir.EditChildren(fn, edit)
512611
ir.CurFunc = savefn
@@ -525,11 +624,16 @@ func InlineCalls(fn *ir.Func) {
525624
// shorter and less complicated.
526625
// The result of inlnode MUST be assigned back to n, e.g.
527626
// n.Left = inlnode(n.Left)
528-
func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
627+
func inlnode(n ir.Node, maxCost int32, inlCtx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
529628
if n == nil {
530629
return n
531630
}
532631

632+
if updateForsStack(inlCtx, n); ancestorForsAreSmall(inlCtx) && maxCost == inlineMaxBudget {
633+
// Boosts only regular functions
634+
maxCost += inlineExtraForBudget
635+
}
636+
533637
switch n.Op() {
534638
case ir.ODEFER, ir.OGO:
535639
n := n.(*ir.GoDeferStmt)
@@ -584,7 +688,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
584688
break
585689
}
586690
if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
587-
n = mkinlcall(call, fn, maxCost, inlMap, edit)
691+
n = mkinlcall(call, fn, maxCost, inlCtx, edit)
588692
}
589693
}
590694

@@ -657,7 +761,7 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
657761
// parameters.
658762
// The result of mkinlcall MUST be assigned back to n, e.g.
659763
// n.Left = mkinlcall(n.Left, fn, isddd)
660-
func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
764+
func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlCtx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
661765
if fn.Inl == nil {
662766
if logopt.Enabled() {
663767
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
@@ -693,15 +797,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
693797
return n
694798
}
695799

696-
if inlMap[fn] {
800+
if inlCtx.inlMap[fn] {
697801
if base.Flag.LowerM > 1 {
698802
fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
699803
}
700804
return n
701805
}
702-
inlMap[fn] = true
806+
inlCtx.inlMap[fn] = true
703807
defer func() {
704-
inlMap[fn] = false
808+
inlCtx.inlMap[fn] = false
705809
}()
706810

707811
typecheck.FixVariadicCall(n)
@@ -730,6 +834,8 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
730834
res = oldInline(n, fn, inlIndex)
731835
}
732836

837+
fixupForStackAfterInline(inlCtx, n, res)
838+
733839
// transitive inlining
734840
// might be nice to do this before exporting the body,
735841
// but can't emit the body with inlining expanded.

test/inline.go

+40
Original file line numberDiff line numberDiff line change
@@ -292,3 +292,43 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2"
292292
func conv1(v uint64) uint64 { // ERROR "can inline conv1"
293293
return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v)))))))))))
294294
}
295+
296+
// Inline into FORs
297+
// func_with_cost_88 is the callee used by func_with_fors to check inlining
// into FOR loops. It carries no "can inline" expectation of its own.
// NOTE(review): the name implies an inline cost of 88, presumably above the
// regular budget but within the boosted one; confirm with -m=2.
func func_with_cost_88() {
298+
x := 200
299+
for i := 0; i < x; i++ {
300+
if i%2 == 0 {
301+
runtime.GC()
302+
} else {
303+
i += 2
304+
x += 1
305+
}
306+
}
307+
}
308+
309+
// func_with_fors calls func_with_cost_88 both outside and inside FOR loops.
// Only the calls placed inside a loop (directly, in a nested loop, or under
// an if within a loop) carry an "inlining call to" expectation; the calls
// at function top level carry none.
func func_with_fors() {
310+
func_with_cost_88()
311+
312+
for i := 0; i < 100; i++ {
313+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
314+
for j := 0; j < 100; j++ {
315+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
316+
}
317+
if i == 4 {
318+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
319+
}
320+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
321+
}
322+
323+
func_with_cost_88()
324+
func_with_cost_88()
325+
326+
for i := 0; i < 100; i++ {
327+
for j := 0; j < 100; j++ {
328+
for x := 0; x < 100; x++ {
329+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
330+
}
331+
}
332+
}
333+
func_with_cost_88()
334+
}

test/inline_for.go

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// errorcheck -0 -m=2
2+
3+
// Copyright 2015 The Go Authors. All rights reserved.
4+
// Use of this source code is governed by a BSD-style
5+
// license that can be found in the LICENSE file.
6+
7+
// Test, using compiler diagnostic flags, that inlining is working.
8+
// Compiles but does not run.
9+
10+
package foo
11+
12+
import "runtime"
13+
14+
// func_with is a trivial callee; the expected diagnostic is the plain
// "can inline" message rather than the "only into small FORs" variant.
func func_with() int { // ERROR "can inline func_with .*"
15+
return 10
16+
}
17+
18+
// func_with_cost_88 is expected to be reported as inlinable only into small
// FORs. Its own loop is expected to be pushed on the FOR stack as
// {25 25 0}, which is a forContext printed with %v (liveCounter,
// totalNodes, callNodes), so the body must keep its exact shape.
// NOTE(review): the name implies an inline cost of 88; confirm with -m=2.
func func_with_cost_88() { // ERROR "can inline only into small FORs .*"
19+
x := 200
20+
for i := 0; i < x; i++ { // ERROR "add for to stack \[\{25 25 0\}\]"
21+
if i%2 == 0 {
22+
runtime.GC()
23+
} else {
24+
i += 2
25+
x += 1
26+
}
27+
}
28+
}
29+
30+
// func_with_fors checks the -m=2 trace of the FOR stack while calls are
// inlined into nested loops. Each {a b c} group in the expected output is
// one forContext printed with %v (liveCounter, totalNodes, callNodes),
// listed from the bottom of the stack upwards. The "fixup inline" lines
// show the counters after an inlined body has been added under the loops.
func func_with_fors() { // ERROR "can inline only into small FORs .*"
31+
for { // ERROR "add for to stack \[\{6 6 2\}\]"
32+
for { // ERROR "add for to stack \[\{5 6 2\} \{2 2 1\}\]"
33+
func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "fixup inline \[\{36 6 2\} \{33 2 1\}\]" "add for to stack \[\{29 6 2\} \{26 2 1\} \{25 25 0\}\]"
34+
}
35+
for { // ERROR "add for to stack"
36+
func_with() // ERROR "inlining call to func_with" "fixup inline \[\{10 6 2\} \{10 2 1\}\]"
37+
}
38+
}
39+
}

0 commit comments

Comments
 (0)