cmd/compile: boost inlining into FORs

nimelehin · nimelehin · commit 2fec7901f366 · 2021-09-06T10:49:20.000+03:00
As Than McIntosh already mentioned, it's a common practice to boost inlining into FORs, since the callsite could be "hotter". This patch implements this functionality. The implementation uses a stack of FORs to recognise calls which are in a loop. The stack is maintained while inlnode walks the function and contains information about the ancestor FORs of the current node in inlnode. The forContext contains a liveCounter which shows for how many more nodes this FOR is an ancestor. Current constants are the following: A "big" FOR is a FOR which contains >=inlineBigForNodes(50) nodes or >=inlineBigForCallNodes(5) inlinable call nodes. In such FORs no boost is applied. Other FORs are considered to be small and boost callsites with an extra budget equal to inlineExtraForBudget(20). Updates #17566 The following results were obtained on GO1, while the binary size did not increase significantly: 10441232 -> 10465920, which is less than 0.3%. goos: linux goarch: amd64 pkg: test/bench/go1 cpu: Intel(R) Xeon(R) Gold 6230N CPU @ 2.30GHz name old time/op new time/op delta BinaryTree17-8 2.15s ± 1% 2.15s ± 1% ~ (p=0.589 n=6+6) Fannkuch11-8 2.70s ± 0% 2.70s ± 0% -0.08% (p=0.002 n=6+6) FmtFprintfEmpty-8 31.9ns ± 0% 31.9ns ± 3% ~ (p=0.907 n=6+6) FmtFprintfString-8 57.0ns ± 0% 57.6ns ± 0% +1.19% (p=0.004 n=5+6) FmtFprintfInt-8 65.2ns ± 0% 64.1ns ± 0% -1.57% (p=0.002 n=6+6) FmtFprintfIntInt-8 103ns ± 0% 103ns ± 0% ~ (p=0.079 n=5+4) FmtFprintfPrefixedInt-8 119ns ± 0% 118ns ± 0% -0.37% (p=0.008 n=5+5) FmtFprintfFloat-8 169ns ± 0% 173ns ± 0% +2.55% (p=0.004 n=5+6) FmtManyArgs-8 450ns ± 1% 450ns ± 0% ~ (p=1.000 n=6+6) GobDecode-8 4.38ms ± 1% 4.35ms ± 1% ~ (p=0.132 n=6+6) GobEncode-8 3.07ms ± 0% 3.06ms ± 0% -0.38% (p=0.009 n=6+6) Gzip-8 195ms ± 0% 195ms ± 0% ~ (p=0.095 n=5+5) Gunzip-8 28.2ms ± 0% 28.4ms ± 0% +0.57% (p=0.004 n=6+6) HTTPClientServer-8 45.1µs ± 1% 45.3µs ± 1% ~ (p=0.082 n=5+6) JSONEncode-8 7.98ms ± 1% 7.94ms ± 0% -0.47% (p=0.015 n=6+6) JSONDecode-8 35.4ms ± 1% 35.1ms ± 0% -1.04% (p=0.002 
n=6+6) Mandelbrot200-8 4.50ms ± 0% 4.50ms ± 0% ~ (p=0.699 n=6+6) GoParse-8 2.98ms ± 0% 2.99ms ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy0_32-8 55.5ns ± 1% 52.8ns ± 2% -4.94% (p=0.002 n=6+6) RegexpMatchEasy0_1K-8 178ns ± 0% 162ns ± 1% -9.18% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 50.1ns ± 0% 48.4ns ± 2% -3.34% (p=0.002 n=6+6) RegexpMatchEasy1_1K-8 272ns ± 2% 268ns ± 1% ~ (p=0.065 n=6+6) RegexpMatchMedium_32-8 907ns ± 5% 897ns ± 7% ~ (p=0.660 n=6+6) RegexpMatchMedium_1K-8 26.5µs ± 0% 26.6µs ± 0% +0.41% (p=0.008 n=5+5) RegexpMatchHard_32-8 1.28µs ± 0% 1.29µs ± 1% ~ (p=0.167 n=6+6) RegexpMatchHard_1K-8 38.5µs ± 0% 38.6µs ± 0% ~ (p=0.126 n=6+5) Revcomp-8 398ms ± 0% 395ms ± 0% -0.64% (p=0.010 n=6+4) Template-8 48.4ms ± 0% 47.8ms ± 0% -1.30% (p=0.008 n=5+5) TimeParse-8 213ns ± 0% 213ns ± 0% ~ (p=0.108 n=6+6) TimeFormat-8 294ns ± 0% 259ns ± 0% -11.86% (p=0.000 n=5+6) [Geo mean] 40.4µs 40.0µs -1.11% name old speed new speed delta GobDecode-8 175MB/s ± 1% 176MB/s ± 1% ~ (p=0.132 n=6+6) GobEncode-8 250MB/s ± 0% 251MB/s ± 0% +0.38% (p=0.009 n=6+6) Gzip-8 99.3MB/s ± 0% 99.4MB/s ± 0% ~ (p=0.095 n=5+5) Gunzip-8 687MB/s ± 0% 683MB/s ± 0% -0.57% (p=0.004 n=6+6) JSONEncode-8 243MB/s ± 1% 244MB/s ± 0% +0.47% (p=0.015 n=6+6) JSONDecode-8 54.8MB/s ± 1% 55.3MB/s ± 0% +1.04% (p=0.002 n=6+6) GoParse-8 19.4MB/s ± 0% 19.4MB/s ± 1% ~ (p=0.103 n=5+5) RegexpMatchEasy0_32-8 576MB/s ± 1% 606MB/s ± 2% +5.21% (p=0.002 n=6+6) RegexpMatchEasy0_1K-8 5.75GB/s ± 0% 6.33GB/s ± 1% +10.10% (p=0.002 n=6+6) RegexpMatchEasy1_32-8 639MB/s ± 0% 661MB/s ± 2% +3.47% (p=0.002 n=6+6) RegexpMatchEasy1_1K-8 3.76GB/s ± 2% 3.82GB/s ± 1% ~ (p=0.065 n=6+6) RegexpMatchMedium_32-8 35.4MB/s ± 5% 35.7MB/s ± 7% ~ (p=0.615 n=6+6) RegexpMatchMedium_1K-8 38.6MB/s ± 0% 38.4MB/s ± 0% -0.40% (p=0.008 n=5+5) RegexpMatchHard_32-8 25.0MB/s ± 0% 24.8MB/s ± 1% ~ (p=0.167 n=6+6) RegexpMatchHard_1K-8 26.6MB/s ± 0% 26.6MB/s ± 0% ~ (p=0.238 n=5+5) Revcomp-8 639MB/s ± 0% 643MB/s ± 0% +0.65% (p=0.010 n=6+4) Template-8 40.1MB/s ± 0% 40.6MB/s ± 0% 
+1.32% (p=0.008 n=5+5) [Geo mean] 176MB/s 178MB/s +1.38%
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
@@ -51,8 +51,102 @@ const (
 
 	inlineBigFunctionNodes   = 5000 // Functions with this many nodes are considered "big".
 	inlineBigFunctionMaxCost = 20   // Max cost of inlinee when inlining into a "big" function.
+
+	inlineBigForNodes     = 50 // FORs with this many nodes are considered "big" and functions are not forced to be inlined.
+	inlineBigForCallNodes = 5  // FORs with this many call nodes are considered "big" and functions are not forced to be inlined.
+	inlineExtraForBudget  = 20 // Extra budget to inline into not a "big" FOR.
+
+	// The upper budget for a visitor. It accounts for the maximum cost at which a function can still be inlined.
+	inlineVisitorBudget = inlineMaxBudget + inlineExtraForBudget
 )
 
+type forContext struct {
+	liveCounter int // nodes of this FOR still to be visited; the entry is popped once it drops below zero
+	totalNodes  int // node count of the FOR when it was pushed, compared against inlineBigForNodes
+	callNodes   int // inlinable call nodes in the FOR, compared against inlineBigForCallNodes
+}
+
+type inlContext struct {
+	inlMap    map[*ir.Func]bool // functions whose inlining is in progress; used to stop recursive cycles
+	forsStack []forContext      // FORs enclosing the node being processed by inlnode, outermost first
+}
+
+// isinlinable reports whether fn has an inline body whose cost fits the
+// regular inlineMaxBudget, i.e. fn can be inlined without any boost.
+func isinlinable(fn *ir.Func) bool {
+	return !(fn == nil || fn.Inl == nil || fn.Inl.Cost > inlineMaxBudget)
+}
+
+// countInlinableCallNodes walks the tree rooted at n with ir.Any and returns
+// the number of nodes visited and how many of them are inlinable calls, i.e.
+// OCALLFUNC nodes that are not marked NoInline, are not intrinsics and whose
+// callee resolves through inlCallee to a function with an inline body.
+func countInlinableCallNodes(n ir.Node) (int, int) {
+	nodes, inlinableCalls := 0, 0
+	ir.Any(n, func(n ir.Node) bool {
+		nodes++
+		if n.Op() != ir.OCALLFUNC {
+			return false
+		}
+		call := n.(*ir.CallExpr)
+		if call.NoInline || ir.IsIntrinsicCall(call) {
+			return false
+		}
+		if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
+			inlinableCalls++
+		}
+		// Never report a match, so that ir.Any keeps walking the whole tree.
+		return false
+	})
+	return nodes, inlinableCalls
+}
+
+// updateForsStack is called for every node visited by inlnode. It ages the
+// recorded FORs by one node, pops the ones that ran out of nodes, and pushes
+// n when n is itself an OFOR that is not nested in a "big" FOR.
+func updateForsStack(inlCtx *inlContext, n ir.Node) {
+	expired := 0
+	for i := range inlCtx.forsStack {
+		if inlCtx.forsStack[i].liveCounter--; inlCtx.forsStack[i].liveCounter < 0 {
+			expired++
+		}
+	}
+	inlCtx.forsStack = inlCtx.forsStack[:len(inlCtx.forsStack)-expired]
+
+	// Counting the nodes of a FOR nested in a "big" FOR would be wasted
+	// work: no function gets boosted there anyway.
+	if n.Op() != ir.OFOR || (len(inlCtx.forsStack) != 0 && !ancestorForsAreSmall(inlCtx)) {
+		return
+	}
+	nodes, inlinableCalls := countInlinableCallNodes(n)
+	inlCtx.forsStack = append(inlCtx.forsStack, forContext{nodes - 1, nodes - 1, inlinableCalls})
+	if base.Flag.LowerM > 1 {
+		fmt.Printf("%v: add for to stack %v\n", ir.Line(n), inlCtx.forsStack)
+	}
+}
+
+// fixupForStackAfterInline adds the nodes of the inlined body call, which
+// replaced call node n, to every FOR currently recorded on forsStack.
+func fixupForStackAfterInline(inlCtx *inlContext, n ir.Node, call *ir.InlinedCallExpr) {
+	if len(inlCtx.forsStack) == 0 {
+		return
+	}
+	nodes, inlinableCalls := countInlinableCallNodes(call)
+	for i := range inlCtx.forsStack {
+		f := &inlCtx.forsStack[i]
+		f.liveCounter += nodes - 1
+		f.callNodes += inlinableCalls
+	}
+
+	if base.Flag.LowerM > 1 {
+		fmt.Printf("%v: fixup inline %v\n", ir.Line(n), inlCtx.forsStack)
+	}
+}
+
+func ancestorForsAreSmall(inlCtx *inlContext) bool { // true when forsStack is non-empty and its first (outermost) FOR is below both "big" limits
+	return len(inlCtx.forsStack) > 0 && inlCtx.forsStack[0].totalNodes < inlineBigForNodes && inlCtx.forsStack[0].callNodes < inlineBigForCallNodes
+}
+
 // InlinePackage finds functions that can be inlined and clones them before walk expands them.
 func InlinePackage() {
 	ir.VisitFuncsBottomUp(typecheck.Target.Decls, func(list []*ir.Func, recursive bool) {
@@ -167,7 +261,7 @@ func CanInline(fn *ir.Func) {
 	// list. See issue 25249 for more context.
 
 	visitor := hairyVisitor{
-		budget:        inlineMaxBudget,
+		budget:        inlineVisitorBudget,
 		extraCallCost: cc,
 	}
 	if visitor.tooHairy(fn) {
@@ -176,20 +270,24 @@ func CanInline(fn *ir.Func) {
 	}
 
 	n.Func.Inl = &ir.Inline{
-		Cost: inlineMaxBudget - visitor.budget,
+		Cost: inlineVisitorBudget - visitor.budget,
 		Dcl:  pruneUnusedAutos(n.Defn.(*ir.Func).Dcl, &visitor),
 		Body: inlcopylist(fn.Body),
 
 		CanDelayResults: canDelayResults(fn),
 	}
 
 	if base.Flag.LowerM > 1 {
-		fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, inlineMaxBudget-visitor.budget, fn.Type(), ir.Nodes(n.Func.Inl.Body))
-	} else if base.Flag.LowerM != 0 {
+		if isinlinable(n.Func) {
+			fmt.Printf("%v: can inline %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		} else {
+			fmt.Printf("%v: can inline only into small FORs %v with cost %d as: %v { %v }\n", ir.Line(fn), n, n.Func.Inl.Cost, fn.Type(), ir.Nodes(n.Func.Inl.Body))
+		}
+	} else if base.Flag.LowerM != 0 && isinlinable(n.Func) {
 		fmt.Printf("%v: can inline %v\n", ir.Line(fn), n)
 	}
 	if logopt.Enabled() {
-		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", inlineMaxBudget-visitor.budget))
+		logopt.LogOpt(fn.Pos(), "canInlineFunction", "inline", ir.FuncName(fn), fmt.Sprintf("cost: %d", n.Func.Inl.Cost))
 	}
 }
 
@@ -241,7 +339,7 @@ func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
 		return true
 	}
 	if v.budget < 0 {
-		v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineMaxBudget-v.budget, inlineMaxBudget)
+		v.reason = fmt.Sprintf("function too complex: cost %d exceeds budget %d", inlineVisitorBudget-v.budget, inlineVisitorBudget)
 		return true
 	}
 	return false
@@ -503,10 +601,11 @@ func InlineCalls(fn *ir.Func) {
 	// but allow inlining if there is a recursion cycle of many functions.
 	// Most likely, the inlining will stop before we even hit the beginning of
 	// the cycle again, but the map catches the unusual case.
-	inlMap := make(map[*ir.Func]bool)
+	inlCtx := inlContext{make(map[*ir.Func]bool), make([]forContext, 0)}
+
 	var edit func(ir.Node) ir.Node
 	edit = func(n ir.Node) ir.Node {
-		return inlnode(n, maxCost, inlMap, edit)
+		return inlnode(n, maxCost, &inlCtx, edit)
 	}
 	ir.EditChildren(fn, edit)
 	ir.CurFunc = savefn
@@ -525,11 +624,16 @@ func InlineCalls(fn *ir.Func) {
 // shorter and less complicated.
 // The result of inlnode MUST be assigned back to n, e.g.
 // 	n.Left = inlnode(n.Left)
-func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func inlnode(n ir.Node, maxCost int32, inlCtx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if n == nil {
 		return n
 	}
 
+	if updateForsStack(inlCtx, n); ancestorForsAreSmall(inlCtx) && maxCost == inlineMaxBudget {
+		// Boosts only regular functions
+		maxCost += inlineExtraForBudget
+	}
+
 	switch n.Op() {
 	case ir.ODEFER, ir.OGO:
 		n := n.(*ir.GoDeferStmt)
@@ -584,7 +688,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
 			break
 		}
 		if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
-			n = mkinlcall(call, fn, maxCost, inlMap, edit)
+			n = mkinlcall(call, fn, maxCost, inlCtx, edit)
 		}
 	}
 
@@ -657,7 +761,7 @@ var NewInline = func(call *ir.CallExpr, fn *ir.Func, inlIndex int) *ir.InlinedCa
 // parameters.
 // The result of mkinlcall MUST be assigned back to n, e.g.
 // 	n.Left = mkinlcall(n.Left, fn, isddd)
-func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.Node) ir.Node) ir.Node {
+func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlCtx *inlContext, edit func(ir.Node) ir.Node) ir.Node {
 	if fn.Inl == nil {
 		if logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(ir.CurFunc),
@@ -693,15 +797,15 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
 		return n
 	}
 
-	if inlMap[fn] {
+	if inlCtx.inlMap[fn] {
 		if base.Flag.LowerM > 1 {
 			fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", ir.Line(n), fn, ir.FuncName(ir.CurFunc))
 		}
 		return n
 	}
-	inlMap[fn] = true
+	inlCtx.inlMap[fn] = true
 	defer func() {
-		inlMap[fn] = false
+		inlCtx.inlMap[fn] = false
 	}()
 
 	typecheck.FixVariadicCall(n)
@@ -730,6 +834,8 @@ func mkinlcall(n *ir.CallExpr, fn *ir.Func, maxCost int32, inlMap map[*ir.Func]b
 		res = oldInline(n, fn, inlIndex)
 	}
 
+	fixupForStackAfterInline(inlCtx, n, res)
+
 	// transitive inlining
 	// might be nice to do this before exporting the body,
 	// but can't emit the body with inlining expanded.
diff --git a/test/inline.go b/test/inline.go
@@ -292,3 +292,43 @@ func conv2(v uint64) uint64 { // ERROR "can inline conv2"
 func conv1(v uint64) uint64 { // ERROR "can inline conv1"
 	return uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(uint64(v)))))))))))
 }
+
+// Inline into FORs
+func func_with_cost_88() { // too costly for the regular budget; meant to be inlined only with the FOR boost
+	x := 200
+	for i := 0; i < x; i++ {
+		if i%2 == 0 {
+			runtime.GC()
+		} else {
+			i += 2
+			x += 1
+		}
+	}
+}
+
+func func_with_fors() {
+	func_with_cost_88() // outside any FOR: no boost, so no inlining diagnostic is expected
+
+	for i := 0; i < 100; i++ {
+		func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+		for j := 0; j < 100; j++ {
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+		}
+		if i == 4 {
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+		}
+		func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+	}
+
+	func_with_cost_88() // between the loops: again no boost and no diagnostic expected
+	func_with_cost_88() // same as the call above
+
+	for i := 0; i < 100; i++ {
+		for j := 0; j < 100; j++ {
+			for x := 0; x < 100; x++ {
+				func_with_cost_88() // ERROR "inlining call to func_with_cost_88"
+			}
+		}
+	}
+	func_with_cost_88() // after the last FOR: no boost and no diagnostic expected
+}
diff --git a/test/inline_for.go b/test/inline_for.go
@@ -0,0 +1,39 @@
+// errorcheck -0 -m=2
+
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test, using compiler diagnostic flags, that inlining is working.
+// Compiles but does not run.
+
+package foo
+
+import "runtime"
+
+func func_with() int { // ERROR "can inline func_with .*"
+	return 10 // cheap body: reported with the regular "can inline" message, no FOR boost needed
+}
+
+func func_with_cost_88() { // ERROR "can inline only into small FORs .*"
+	x := 200
+	for i := 0; i < x; i++ { // ERROR "add for to stack \[\{25 25 0\}\]"
+		if i%2 == 0 {
+			runtime.GC() // not counted as an inlinable call: the expected callNodes above is 0
+		} else {
+			i += 2
+			x += 1
+		}
+	}
+}
+
+func func_with_fors() { // ERROR "can inline only into small FORs .*"
+	for { // ERROR "add for to stack \[\{6 6 2\}\]"
+		for { // ERROR "add for to stack \[\{5 6 2\} \{2 2 1\}\]"
+			func_with_cost_88() // ERROR "inlining call to func_with_cost_88" "fixup inline \[\{36 6 2\} \{33 2 1\}\]" "add for to stack \[\{29 6 2\} \{26 2 1\} \{25 25 0\}\]"
+		} // the FOR inside the inlined body is expected as the third stack entry above
+		for { // ERROR "add for to stack"
+			func_with() // ERROR "inlining call to func_with" "fixup inline \[\{10 6 2\} \{10 2 1\}\]"
+		} // func_with fits the regular budget, so its inlining does not depend on the boost
+	}
+}