Skip to content

Commit 7eb224f

Browse files
committed
cmd/compile: boost inlining into FORs
As Than McIntosh already mentioned, it's a common practice to boost inlining into FORs, since the call site could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained while inlnode works and contains information about the ancestor FORs relative to the current node in inlnode. A FOR whose cost is >= inlineBigForCost (105) is considered "big". In such FORs no boost is applied. Updates #17566 The following results were obtained on GO1, while the binary size did not increase significantly: 10454800 -> 10475120, which is less than 0.3%. goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.17s ± 1% ~ (p=0.065 n=6+6) Fannkuch11-8 2.70s ± 0% 2.69s ± 0% -0.25% (p=0.010 n=6+4) FmtFprintfEmpty-8 31.9ns ± 0% 31.4ns ± 0% -1.61% (p=0.008 n=5+5) FmtFprintfString-8 57.0ns ± 0% 57.1ns ± 0% +0.26% (p=0.013 n=6+5) FmtFprintfInt-8 65.2ns ± 0% 63.9ns ± 0% -1.95% (p=0.008 n=5+5) FmtFprintfIntInt-8 103ns ± 0% 102ns ± 0% -1.01% (p=0.000 n=5+4) FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.50% (p=0.008 n=5+5) FmtFprintfFloat-8 169ns ± 0% 174ns ± 0% +2.75% (p=0.008 n=5+5) FmtManyArgs-8 445ns ± 0% 447ns ± 0% +0.46% (p=0.002 n=6+6) GobDecode-8 4.37ms ± 1% 4.40ms ± 0% +0.62% (p=0.009 n=6+6) GobEncode-8 3.07ms ± 0% 3.04ms ± 0% -0.78% (p=0.004 n=5+6) Gzip-8 195ms ± 0% 195ms ± 0% ~ (p=0.429 n=5+6) Gunzip-8 28.2ms ± 0% 28.2ms ± 0% ~ (p=0.662 n=5+6) HTTPClientServer-8 45.0µs ± 1% 45.4µs ± 1% ~ (p=0.093 n=6+6) JSONEncode-8 8.01ms ± 0% 8.03ms ± 0% +0.31% (p=0.008 n=5+5) JSONDecode-8 35.3ms ± 1% 35.1ms ± 0% -0.72% (p=0.008 n=5+5) Mandelbrot200-8 4.50ms ± 0% 4.49ms ± 1% ~ (p=0.937 n=6+6) GoParse-8 3.03ms ± 1% 3.00ms ± 1% ~ (p=0.180 n=6+6) RegexpMatchEasy0_32-8 55.4ns ± 0% 53.2ns ± 3% -3.92% (p=0.004 n=5+6) RegexpMatchEasy0_1K-8 178ns ± 0% 175ns ± 1% -1.57% (p=0.004 n=5+6) RegexpMatchEasy1_32-8 50.1ns ± 0% 48.3ns ± 5% ~ 
(p=0.082 n=5+6) RegexpMatchEasy1_1K-8 271ns ± 1% 262ns ± 1% -3.26% (p=0.004 n=6+5) RegexpMatchMedium_32-8 949ns ± 0% 886ns ± 7% ~ (p=0.329 n=5+6) RegexpMatchMedium_1K-8 27.1µs ± 7% 28.1µs ± 6% ~ (p=0.394 n=6+6) RegexpMatchHard_32-8 1.28µs ± 2% 1.29µs ± 0% ~ (p=0.056 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.4µs ± 0% -0.25% (p=0.009 n=6+5) Revcomp-8 397ms ± 0% 396ms ± 0% ~ (p=0.429 n=6+5) Template-8 48.1ms ± 1% 48.1ms ± 0% ~ (p=0.222 n=5+5) TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.210 n=4+6) TimeFormat-8 295ns ± 1% 259ns ± 0% -12.22% (p=0.002 n=6+6) [Geo mean] 40.5µs 40.1µs -1.00% name old speed new speed delta GobDecode-8 176MB/s ± 1% 174MB/s ± 0% -0.61% (p=0.009 n=6+6) GobEncode-8 250MB/s ± 0% 252MB/s ± 0% +0.79% (p=0.004 n=5+6) Gzip-8 100MB/s ± 0% 100MB/s ± 0% ~ (p=0.351 n=5+6) Gunzip-8 687MB/s ± 0% 687MB/s ± 0% ~ (p=0.662 n=5+6) JSONEncode-8 242MB/s ± 0% 242MB/s ± 0% -0.31% (p=0.008 n=5+5) JSONDecode-8 54.9MB/s ± 1% 55.3MB/s ± 0% +0.71% (p=0.008 n=5+5) GoParse-8 19.1MB/s ± 1% 19.3MB/s ± 1% ~ (p=0.143 n=6+6) RegexpMatchEasy0_32-8 578MB/s ± 0% 601MB/s ± 3% +4.10% (p=0.004 n=5+6) RegexpMatchEasy0_1K-8 5.74GB/s ± 1% 5.85GB/s ± 1% +1.90% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 663MB/s ± 4% ~ (p=0.082 n=5+6) RegexpMatchEasy1_1K-8 3.78GB/s ± 1% 3.91GB/s ± 1% +3.38% (p=0.004 n=6+5) RegexpMatchMedium_32-8 33.7MB/s ± 0% 36.2MB/s ± 7% ~ (p=0.268 n=5+6) RegexpMatchMedium_1K-8 37.9MB/s ± 6% 36.5MB/s ± 6% ~ (p=0.411 n=6+6) RegexpMatchHard_32-8 24.9MB/s ± 2% 24.8MB/s ± 0% ~ (p=0.063 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.7MB/s ± 0% +0.25% (p=0.009 n=6+5) Revcomp-8 640MB/s ± 0% 641MB/s ± 0% ~ (p=0.429 n=6+5) Template-8 40.4MB/s ± 1% 40.3MB/s ± 0% ~ (p=0.222 n=5+5) [Geo mean] 175MB/s 177MB/s +1.05%
1 parent d0dd26a commit 7eb224f

File tree

3 files changed

+258
-45
lines changed

3 files changed

+258
-45
lines changed

src/cmd/compile/internal/inline/inl.go

+165-45
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,97 @@ const (
4545
inlineMaxBudget = 80
4646
inlineExtraAppendCost = 0
4747
// default is to inline if there's at most one call. -l=4 overrides this by using 1 instead.
48-
inlineExtraCallCost = 57 // 57 was benchmarked to provided most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
48+
inlineExtraCallCost = 57 // 57 was benchmarked to provide most benefit with no bad surprises; see https://github.com/golang/go/issues/19348#issuecomment-439370742
4949
inlineExtraPanicCost = 1 // do not penalize inlining panics.
5050
inlineExtraThrowCost = inlineMaxBudget // with current (2018-05/1.11) code, inlining runtime.throw does not help.
5151

5252
inlineBigFunctionNodes = 5000 // Functions with this many nodes are considered "big".
5353
inlineBigFunctionMaxCost = 20 // Max cost of inlinee when inlining into a "big" function.
54+
55+
// These values were benchmarked to provide most benefit with no bad surprises.
56+
inlineBigForCost = 105 // FORs with at least this cost are considered "big".
57+
inlineIntoForExtraCallCost = 14
58+
inlineIntoForExtraBudget = 18 // Extra budget when inlining into FORs which are not "big".
59+
60+
// The upper budget for a visitor. It accounts for the maximum cost with which a function could be inlined.
61+
inlineVisitorBudget = inlineMaxBudget + inlineIntoForExtraBudget
5462
)
5563

64+
// isInlinable checks if the function can be inlined in a 'typical' scenario
65+
// when no boosts are applied.
66+
func isInlinable(fn *ir.Func) bool {
67+
return fn != nil && fn.Inl != nil && fn.Inl.Cost <= inlineMaxBudget
68+
}
69+
70+
type forContext struct {
71+
cost int32 // Cost helps to determine if FOR is a "big" one.
72+
}
73+
74+
type inlContext struct {
75+
// Map to keep track of functions that have been inlined at a particular
76+
// call site, in order to stop inlining when we reach the beginning of a
77+
// recursion cycle again. We don't inline immediately recursive functions,
78+
// but allow inlining if there is a recursion cycle of many functions.
79+
// Most likely, the inlining will stop before we even hit the beginning of
80+
// the cycle again, but the map catches the unusual case.
81+
inlinedCallees map[*ir.Func]bool
82+
83+
// Stack to recognise which call nodes are located inside fors, while doing inlnode.
84+
forsStack []forContext
85+
initialInlineBudget int32 // Initial inline budget. Boosts are calculated related to this.
86+
}
87+
88+
// canBoostInliningIntoFor reports whether the current node is inside at least one FOR and none of the enclosing FORs is "big".
89+
func (ctx inlContext) canBoostInliningIntoFor() bool {
90+
for i := 0; i < len(ctx.forsStack); i++ {
91+
if ctx.forsStack[i].cost >= inlineBigForCost {
92+
return false
93+
}
94+
}
95+
return len(ctx.forsStack) > 0
96+
}
97+
98+
func (ctx *inlContext) Init(fn *ir.Func) {
99+
ctx.inlinedCallees = make(map[*ir.Func]bool)
100+
101+
if isBigFunc(fn) {
102+
ctx.initialInlineBudget = inlineBigFunctionMaxCost
103+
} else {
104+
ctx.initialInlineBudget = inlineMaxBudget
105+
}
106+
}
107+
108+
func (ctx *inlContext) PushFor(n ir.Node) {
109+
ctx.forsStack = append(ctx.forsStack, forContext{forCost(n)})
110+
111+
if base.Flag.LowerM > 1 {
112+
fmt.Printf("%v: add FOR to stack %v\n", ir.Line(n), ctx.forsStack)
113+
}
114+
}
115+
116+
func (ctx *inlContext) PopFor() {
117+
ctx.forsStack = ctx.forsStack[:len(ctx.forsStack)-1]
118+
}
119+
120+
func (ctx inlContext) InlineBudget() int32 {
121+
finalBudget := ctx.initialInlineBudget
122+
if ctx.canBoostInliningIntoFor() && ctx.initialInlineBudget == inlineMaxBudget {
123+
// Boosts only regular functions
124+
finalBudget += inlineIntoForExtraBudget
125+
}
126+
127+
return finalBudget
128+
}
129+
130+
func forCost(n ir.Node) int32 {
131+
visitor := hairyVisitor{
132+
budget: inlineBigForCost,
133+
extraCallCost: inlineIntoForExtraCallCost,
134+
}
135+
visitor.forTooHairy(n)
136+
return inlineBigForCost - visitor.budget
137+
}
138+
56139
// InlinePackage finds functions that can be inlined and clones them before walk expands them.
57140
func InlinePackage() {
58141
ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -167,29 +250,33 @@ func CanInline(fn *ir.Func) {
167250
// list. See issue 25249 for more context.
168251

169252
visitor := hairyVisitor{
170-
budget: inlineMaxBudget,
253+
budget: inlineVisitorBudget,
171254
extraCallCost: cc,
172255
}
173-
if visitor.tooHairy(fn) {
256+
if visitor.funcTooHairy(fn) {
174257
reason = visitor.reason
175258
return
176259
}
177260

178261
n.Func.Inl = &ir.Inline{
179-
Cost: inlineMaxBudget - visitor.budget,
262+
Cost: inlineVisitorBudget - visitor.budget,
180263
Dcl: pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
181264
Body: inlcopylist(fn.Body),
182265

183266
CanDelayResults: canDelayResults(fn),
184267
}
185268

186269
if base.Flag.LowerM > 1 {
187-
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
188-
} else if base.Flag.LowerM != 0 {
270+
if isInlinable(n.Func) {
271+
fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
272+
} else {
273+
fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
274+
}
275+
} else if base.Flag.LowerM != 0 && isInlinable(n.Func) {
189276
fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
190277
}
191278
if logopt.Enabled() {
192-
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
279+
logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
193280
}
194281
}
195282

@@ -228,20 +315,35 @@ func canDelayResults(fn *ir.Func) bool {
228315
// hairyVisitor visits a function body to determine its inlining
229316
// hairiness and whether or not it can be inlined.
230317
type hairyVisitor struct {
231-
budget int32
232-
reason string
233-
extraCallCost int32
234-
usedLocals ir.NameSet
235-
do func(ir.Node) bool
318+
budget int32
319+
extraCallCost int32
320+
stopIfNodeNotInlinable bool
321+
reason string
322+
usedLocals ir.NameSet
323+
do func(ir.Node) bool
236324
}
237325

238-
func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
239-
v.do = v.doNode // cache closure
326+
func (v *hairyVisitor) funcTooHairy(fn *ir.Func) bool {
327+
v.stopIfNodeNotInlinable = true // No reason to continue, if node of the fn is not inlinable.
328+
v.do = v.doNode // cache closure
240329
if ir.DoChildren(fn, v.do) {
241330
return true
242331
}
243332
if v.budget < 0 {
244-
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
333+
v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-v.budget, inlineVisitorBudget)
334+
return true
335+
}
336+
return false
337+
}
338+
339+
func (v *hairyVisitor) forTooHairy(n ir.Node) bool {
340+
v.stopIfNodeNotInlinable = false // Use doNode to calculate the final cost only.
341+
v.do = v.doNode // cache closure
342+
if ir.DoChildren(n, v.do) {
343+
return true
344+
}
345+
if v.budget < 0 {
346+
v.reason = fmt.Sprintf("for is big: cost %d", inlineBigForCost-v.budget)
245347
return true
246348
}
247349
return false
@@ -265,7 +367,11 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
265367
fn := name.Sym().Name
266368
if fn == "getcallerpc" || fn == "getcallersp" {
267369
v.reason = "call to " + fn
268-
return true
370+
if v.stopIfNodeNotInlinable {
371+
return true
372+
} else {
373+
break
374+
}
269375
}
270376
if fn == "throw" {
271377
v.budget -= inlineExtraThrowCost
@@ -292,7 +398,7 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
292398
break
293399
}
294400

295-
if fn := inlCallee(n.X); fn != nil && fn.Inl != nil {
401+
if fn := inlCallee(n.X); isInlinable(fn) {
296402
v.budget -= fn.Inl.Cost
297403
break
298404
}
@@ -322,12 +428,18 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
322428
// recover matches the argument frame pointer to find
323429
// the right panic value, so it needs an argument frame.
324430
v.reason = "call to recover"
325-
return true
431+
if v.stopIfNodeNotInlinable {
432+
return true
433+
}
326434

327435
case ir.OCLOSURE:
328436
if base.Debug.InlFuncsWithClosures == 0 {
329437
v.reason = "not inlining functions with closures"
330-
return true
438+
if v.stopIfNodeNotInlinable {
439+
return true
440+
} else {
441+
break
442+
}
331443
}
332444

333445
// TODO(danscales): Maybe make budget proportional to number of closure
@@ -338,7 +450,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
338450
// do) to check for disallowed ops in the body and include the
339451
// body in the budget.
340452
if doList(n.(*ir.ClosureExpr).Func.Body, v.do) {
341-
return true
453+
if v.stopIfNodeNotInlinable {
454+
return true
455+
}
342456
}
343457

344458
case ir.ORANGE,
@@ -348,7 +462,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
348462
ir.ODCLTYPE, // can't print yet
349463
ir.OTAILCALL:
350464
v.reason = "unhandled op " + n.Op().String()
351-
return true
465+
if v.stopIfNodeNotInlinable {
466+
return true
467+
}
352468

353469
case ir.OAPPEND:
354470
v.budget -= inlineExtraAppendCost
@@ -377,21 +493,27 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
377493
n := n.(*ir.ForStmt)
378494
if n.Label != nil {
379495
v.reason = "labeled control"
380-
return true
496+
if v.stopIfNodeNotInlinable {
497+
return true
498+
}
381499
}
382500
case ir.OSWITCH:
383501
n := n.(*ir.SwitchStmt)
384502
if n.Label != nil {
385503
v.reason = "labeled control"
386-
return true
504+
if v.stopIfNodeNotInlinable {
505+
return true
506+
}
387507
}
388508
// case ir.ORANGE, ir.OSELECT in "unhandled" above
389509

390510
case ir.OBREAK, ir.OCONTINUE:
391511
n := n.(*ir.BranchStmt)
392512
if n.Label != nil {
393-
// Should have short-circuited due to labeled control error above.
394-
base.Fatalf("unexpected labeled break/continue: %v", n)
513+
if v.stopIfNodeNotInlinable {
514+
// Should have short-circuited due to labeled control error above.
515+
base.Fatalf("unexpected labeled break/continue: %v", n)
516+
}
395517
}
396518

397519
case ir.OIF:
@@ -497,20 +619,13 @@ func inlcopy(n ir.Node) ir.Node {
497619
func InlineCalls(fn *ir.Func) {
498620
savefn := ir.CurFunc
499621
ir.CurFunc = fn
500-
maxCost := int32(inlineMaxBudget)
501-
if isBigFunc(fn) {
502-
maxCost = inlineBigFunctionMaxCost
503-
}
504-
// Map to keep track of functions that have been inlined at a particular
505-
// call site, in order to stop inlining when we reach the beginning of a
506-
// recursion cycle again. We don't inline immediately recursive functions,
507-
// but allow inlining if there is a recursion cycle of many functions.
508-
// Most likely, the inlining will stop before we even hit the beginning of
509-
// the cycle again, but the map catches the unusual case.
510-
inlMap := make(map[*ir.Func]bool)
622+
623+
var inlCtx inlContext
624+
inlCtx.Init(fn)
625+
511626
var edit func(ir.Node) ir.Node
512627
edit = func(n ir.Node) ir.Node {
513-
return inlnode(n, maxCost, inlMap, edit)
628+
return inlnode(n, &inlCtx, edit)
514629
}
515630
ir.EditChildren(fn, edit)
516631
ir.CurFunc = savefn
@@ -529,11 +644,16 @@ func InlineCalls(fn *ir.Func) {
529644
// shorter and less complicated.
530645
// The result of inlnode MUST be assigned back to n, e.g.
531646
// n.Left = inlnode(n.Left)
532-
func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
647+
func inlnode(n ir.Node, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
533648
if n == nil {
534649
return n
535650
}
536651

652+
if n.Op() == ir.OFOR {
653+
ctx.PushFor(n)
654+
defer ctx.PopFor()
655+
}
656+
537657
switch n.Op() {
538658
case ir.ODEFER, ir.OGO:
539659
n := n.(*ir.GoDeferStmt)
@@ -591,7 +711,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
591711
break
592712
}
593713
if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
594-
n = mkinlcall(call, fn, maxCost, inlMap, edit)
714+
n = mkinlcall(call, fn, ctx, edit)
595715
}
596716
}
597717

@@ -664,20 +784,20 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
664784
// parameters.
665785
// The result of mkinlcall MUST be assigned back to n, e.g.
666786
// n.Left = mkinlcall(n.Left, fn, isddd)
667-
func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
787+
func mkinlcall(n *ir.CallExpr, fn *ir.Func, ctx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
668788
if fn.Inl == nil {
669789
if logopt.Enabled() {
670790
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
671791
fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(fn)))
672792
}
673793
return n
674794
}
675-
if fn.Inl.Cost > maxCost {
795+
if fn.Inl.Cost > ctx.InlineBudget() {
676796
// The inlined function body is too big. Typically we use this check to restrict
677797
// inlining into very big functions. See issue 26546 and 17566.
678798
if logopt.Enabled() {
679799
logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
680-
fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), maxCost))
800+
fmt.Sprintf("cost %d of %s exceeds max large caller cost %d", fn.Inl.Cost, ir.PkgFuncName(fn), ctx.InlineBudget()))
681801
}
682802
return n
683803
}
@@ -700,15 +820,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
700820
return n
701821
}
702822

703-
if inlMap[fn] {
823+
if ctx.inlinedCallees[fn] {
704824
if base.Flag.LowerM > 1 {
705825
fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
706826
}
707827
return n
708828
}
709-
inlMap[fn] = true
829+
ctx.inlinedCallees[fn] = true
710830
defer func() {
711-
inlMap[fn] = false
831+
ctx.inlinedCallees[fn] = false
712832
}()
713833

714834
typecheck.FixVariadicCall(n)

0 commit comments

Comments
 (0)