@@ -86,10 +86,134 @@ STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
86
86
STATISTIC (StackSlotMerged, " Number of stack slot merged." );
87
87
STATISTIC (EscapedAllocas, " Number of allocas that escaped the lifetime region" );
88
88
89
+ // ===----------------------------------------------------------------------===//
90
+ // StackColoring Pass
91
+ // ===----------------------------------------------------------------------===//
92
+ //
93
+ // Stack Coloring reduces stack usage by merging stack slots when they
94
+ // can't be used together. For example, consider the following C program:
95
+ //
96
+ // void bar(char *, int);
97
+ // void foo(bool var) {
98
+ // A: {
99
+ // char z[4096];
100
+ // bar(z, 0);
101
+ // }
102
+ //
103
+ // char *p;
104
+ // char x[4096];
105
+ // char y[4096];
106
+ // if (var) {
107
+ // p = x;
108
+ // } else {
109
+ // bar(y, 1);
110
+ // p = y + 1024;
111
+ // }
112
+ // B:
113
+ // bar(p, 2);
114
+ // }
115
+ //
116
+ // Naively-compiled, this program would use 12k of stack space. However, the
117
+ // stack slot corresponding to `z` is always destroyed before either of the
118
+ // stack slots for `x` or `y` are used, and then `x` is only used if `var`
119
+ // is true, while `y` is only used if `var` is false. So in no time are 2
120
+ // of the stack slots used together, and therefore we can merge them,
121
+ // compiling the function using only a single 4k alloca:
122
+ //
123
+ // void foo(bool var) { // equivalent
124
+ // char x[4096];
125
+ // char *p;
126
+ // bar(x, 0);
127
+ // if (var) {
128
+ // p = x;
129
+ // } else {
130
+ // bar(x, 1);
131
+ // p = x + 1024;
132
+ // }
133
+ // bar(p, 2);
134
+ // }
135
+ //
136
+ // This is an important optimization if we want stack space to be under
137
+ // control in large functions, both open-coded ones and ones created by
138
+ // inlining.
89
139
//
90
140
// Implementation Notes:
91
141
// ---------------------
92
142
//
143
+ // An important part of the above reasoning is that `z` can't be accessed
144
+ // while the latter 2 calls to `bar` are running. This is justified because
145
+ // `z`'s lifetime is over after we exit from block `A:`, so any further
146
+ // accesses to it would be UB. The way we represent this information
147
+ // in LLVM is by having frontends delimit blocks with `lifetime.start`
148
+ // and `lifetime.end` intrinsics.
149
+ //
150
+ // The effect of these intrinsics seems to be as follows (maybe I should
151
+ // specify this in the reference?):
152
+ //
153
+ // L1) at start, each stack-slot is marked as *out-of-scope*, unless no
154
+ // lifetime intrinsic refers to that stack slot, in which case
155
+ // it is marked as *in-scope*.
156
+ // L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
157
+ // the stack slot is overwritten with `undef`.
158
+ // L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
159
+ // L4) on function exit, all stack slots are marked as *out-of-scope*.
160
+ // L5) `lifetime.end` is a no-op when called on a slot that is already
161
+ // *out-of-scope*.
162
+ // L6) memory accesses to *out-of-scope* stack slots are UB.
163
+ // L7) when a stack-slot is marked as *out-of-scope*, all pointers to it
164
+ // are invalidated, unless the slot is "degenerate". This is used to
165
+ // justify not marking slots as in-use until the pointer to them is
166
+ // used, but feels a bit hacky in the presence of things like LICM. See
167
+ // the "Degenerate Slots" section for more details.
168
+ //
169
+ // Now, let's ground stack coloring on these rules. We'll define a slot
170
+ // as *in-use* at a (dynamic) point in execution if it either can be
171
+ // written to at that point, or if it has a live and non-undef content
172
+ // at that point.
173
+ //
174
+ // Obviously, slots that are never *in-use* together can be merged, and
175
+ // in our example `foo`, the slots for `x`, `y` and `z` are never
176
+ // in-use together (of course, sometimes slots that *are* in-use together
177
+ // might still be mergable, but we don't care about that here).
178
+ //
179
+ // In this implementation, we successively merge pairs of slots that are
180
+ // not *in-use* together. We could be smarter - for example, we could merge
181
+ // a single large slot with 2 small slots, or we could construct the
182
+ // interference graph and run a "smart" graph coloring algorithm, but with
183
+ // that aside, how do we find out whether a pair of slots might be *in-use*
184
+ // together?
185
+ //
186
+ // From our rules, we see that *out-of-scope* slots are never *in-use*,
187
+ // and from (L7) we see that "non-degenerate" slots remain non-*in-use*
188
+ // until their address is taken. Therefore, we can approximate slot activity
189
+ // using dataflow.
190
+ //
191
+ // A subtle point: naively, we might try to figure out which pairs of
192
+ // stack-slots interfere by propagating `S in-use` through the CFG for every
193
+ // stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in
194
+ // which they are both *in-use*.
195
+ //
196
+ // That is sound, but overly conservative in some cases: in our (artificial)
197
+ // example `foo`, either `x` or `y` might be in use at the label `B:`, but
198
+ // as `x` is only in use if we came in from the `var` edge and `y` only
199
+ // if we came from the `!var` edge, they still can't be in use together.
200
+ // See PR32488 for an important real-life case.
201
+ //
202
+ // If we wanted to find all points of interference precisely, we could
203
+ // propagate `S in-use` and `S&T in-use` predicates through the CFG. That
204
+ // would be precise, but requires propagating `O(n^2)` dataflow facts.
205
+ //
206
+ // However, we aren't interested in the *set* of points of interference
207
+ // between 2 stack slots, only *whether* there *is* such a point. So we
208
+ // can rely on a little trick: for `S` and `T` to be in-use together,
209
+ // one of them needs to become in-use while the other is in-use (or
210
+ // they might both become in use simultaneously). We can check this
211
+ // by also keeping track of the points at which a stack slot might *start*
212
+ // being in-use.
213
+ //
214
+ // Exact first use:
215
+ // ----------------
216
+ //
93
217
// Consider the following motivating example:
94
218
//
95
219
// int foo() {
@@ -158,6 +282,9 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
158
282
// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
159
283
// byte stack (better).
160
284
//
285
+ // Degenerate Slots:
286
+ // -----------------
287
+ //
161
288
// Relying entirely on first-use of stack slots is problematic,
162
289
// however, due to the fact that optimizations can sometimes migrate
163
290
// uses of a variable outside of its lifetime start/end region. Here
@@ -237,10 +364,6 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
237
364
// for "b" then it will appear that 'b' has a degenerate lifetime.
238
365
//
239
366
240
- // ===----------------------------------------------------------------------===//
241
- // StackColoring Pass
242
- // ===----------------------------------------------------------------------===//
243
-
244
367
namespace {
245
368
// / StackColoring - A machine pass for merging disjoint stack allocations,
246
369
// / marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
@@ -271,8 +394,11 @@ class StackColoring : public MachineFunctionPass {
271
394
// / Maps basic blocks to a serial number.
272
395
SmallVector<const MachineBasicBlock*, 8 > BasicBlockNumbering;
273
396
274
- // / Maps liveness intervals for each slot.
397
+ // / Maps slots to their use interval. Outside of this interval, slots
398
+ // / values are either dead or `undef` and they will not be written to.
275
399
SmallVector<std::unique_ptr<LiveInterval>, 16 > Intervals;
400
+ // / Maps slots to the points where they can become in-use.
401
+ SmallVector<SmallVector<SlotIndex, 4 >, 16 > LiveStarts;
276
402
// / VNInfo is used for the construction of LiveIntervals.
277
403
VNInfo::Allocator VNInfoAllocator;
278
404
// / SlotIndex analysis object.
@@ -672,15 +798,22 @@ void StackColoring::calculateLocalLiveness()
672
798
673
799
void StackColoring::calculateLiveIntervals (unsigned NumSlots) {
674
800
SmallVector<SlotIndex, 16 > Starts;
675
- SmallVector<SlotIndex , 16 > Finishes ;
801
+ SmallVector<bool , 16 > DefinitelyInUse ;
676
802
677
803
// For each block, find which slots are active within this block
678
804
// and update the live intervals.
679
805
for (const MachineBasicBlock &MBB : *MF) {
680
806
Starts.clear ();
681
807
Starts.resize (NumSlots);
682
- Finishes.clear ();
683
- Finishes.resize (NumSlots);
808
+ DefinitelyInUse.clear ();
809
+ DefinitelyInUse.resize (NumSlots);
810
+
811
+ // Start the interval of the slots that we previously found to be 'in-use'.
812
+ BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
813
+ for (int pos = MBBLiveness.LiveIn .find_first (); pos != -1 ;
814
+ pos = MBBLiveness.LiveIn .find_next (pos)) {
815
+ Starts[pos] = Indexes->getMBBStartIdx (&MBB);
816
+ }
684
817
685
818
// Create the interval for the basic blocks containing lifetime begin/end.
686
819
for (const MachineInstr &MI : MBB) {
@@ -692,66 +825,35 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
692
825
SlotIndex ThisIndex = Indexes->getInstructionIndex (MI);
693
826
for (auto Slot : slots) {
694
827
if (IsStart) {
695
- if (!Starts[Slot].isValid () || Starts[Slot] > ThisIndex)
828
+ // If a slot is already definitely in use, we don't have to emit
829
+ // a new start marker because there is already a pre-existing
830
+ // one.
831
+ if (!DefinitelyInUse[Slot]) {
832
+ LiveStarts[Slot].push_back (ThisIndex);
833
+ DefinitelyInUse[Slot] = true ;
834
+ }
835
+ if (!Starts[Slot].isValid ())
696
836
Starts[Slot] = ThisIndex;
697
837
} else {
698
- if (!Finishes[Slot].isValid () || Finishes[Slot] < ThisIndex)
699
- Finishes[Slot] = ThisIndex;
838
+ if (Starts[Slot].isValid ()) {
839
+ VNInfo *VNI = Intervals[Slot]->getValNumInfo (0 );
840
+ Intervals[Slot]->addSegment (
841
+ LiveInterval::Segment (Starts[Slot], ThisIndex, VNI));
842
+ Starts[Slot] = SlotIndex (); // Invalidate the start index
843
+ DefinitelyInUse[Slot] = false ;
844
+ }
700
845
}
701
846
}
702
847
}
703
848
704
- // Create the interval of the blocks that we previously found to be 'alive'.
705
- BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
706
- for (unsigned pos : MBBLiveness.LiveIn .set_bits ()) {
707
- Starts[pos] = Indexes->getMBBStartIdx (&MBB);
708
- }
709
- for (unsigned pos : MBBLiveness.LiveOut .set_bits ()) {
710
- Finishes[pos] = Indexes->getMBBEndIdx (&MBB);
711
- }
712
-
849
+ // Finish up started segments
713
850
for (unsigned i = 0 ; i < NumSlots; ++i) {
714
- //
715
- // When LifetimeStartOnFirstUse is turned on, data flow analysis
716
- // is forward (from starts to ends), not bidirectional. A
717
- // consequence of this is that we can wind up in situations
718
- // where Starts[i] is invalid but Finishes[i] is valid and vice
719
- // versa. Example:
720
- //
721
- // LIFETIME_START x
722
- // if (...) {
723
- // <use of x>
724
- // throw ...;
725
- // }
726
- // LIFETIME_END x
727
- // return 2;
728
- //
729
- //
730
- // Here the slot for "x" will not be live into the block
731
- // containing the "return 2" (since lifetimes start with first
732
- // use, not at the dominating LIFETIME_START marker).
733
- //
734
- if (Starts[i].isValid () && !Finishes[i].isValid ()) {
735
- Finishes[i] = Indexes->getMBBEndIdx (&MBB);
736
- }
737
851
if (!Starts[i].isValid ())
738
852
continue ;
739
853
740
- assert (Starts[i] && Finishes[i] && " Invalid interval" );
741
- VNInfo *ValNum = Intervals[i]->getValNumInfo (0 );
742
- SlotIndex S = Starts[i];
743
- SlotIndex F = Finishes[i];
744
- if (S < F) {
745
- // We have a single consecutive region.
746
- Intervals[i]->addSegment (LiveInterval::Segment (S, F, ValNum));
747
- } else {
748
- // We have two non-consecutive regions. This happens when
749
- // LIFETIME_START appears after the LIFETIME_END marker.
750
- SlotIndex NewStart = Indexes->getMBBStartIdx (&MBB);
751
- SlotIndex NewFin = Indexes->getMBBEndIdx (&MBB);
752
- Intervals[i]->addSegment (LiveInterval::Segment (NewStart, F, ValNum));
753
- Intervals[i]->addSegment (LiveInterval::Segment (S, NewFin, ValNum));
754
- }
854
+ SlotIndex EndIdx = Indexes->getMBBEndIdx (&MBB);
855
+ VNInfo *VNI = Intervals[i]->getValNumInfo (0 );
856
+ Intervals[i]->addSegment (LiveInterval::Segment (Starts[i], EndIdx, VNI));
755
857
}
756
858
}
757
859
}
@@ -981,6 +1083,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
981
1083
BasicBlockNumbering.clear ();
982
1084
Markers.clear ();
983
1085
Intervals.clear ();
1086
+ LiveStarts.clear ();
984
1087
VNInfoAllocator.Reset ();
985
1088
986
1089
unsigned NumSlots = MFI->getObjectIndexEnd ();
@@ -992,6 +1095,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
992
1095
SmallVector<int , 8 > SortedSlots;
993
1096
SortedSlots.reserve (NumSlots);
994
1097
Intervals.reserve (NumSlots);
1098
+ LiveStarts.resize (NumSlots);
995
1099
996
1100
unsigned NumMarkers = collectMarkers (NumSlots);
997
1101
@@ -1063,6 +1167,9 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
1063
1167
return MFI->getObjectSize (LHS) > MFI->getObjectSize (RHS);
1064
1168
});
1065
1169
1170
+ for (auto &s : LiveStarts)
1171
+ std::sort (s.begin (), s.end ());
1172
+
1066
1173
bool Changed = true ;
1067
1174
while (Changed) {
1068
1175
Changed = false ;
@@ -1078,12 +1185,22 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
1078
1185
int SecondSlot = SortedSlots[J];
1079
1186
LiveInterval *First = &*Intervals[FirstSlot];
1080
1187
LiveInterval *Second = &*Intervals[SecondSlot];
1188
+ auto &FirstS = LiveStarts[FirstSlot];
1189
+ auto &SecondS = LiveStarts[SecondSlot];
1081
1190
assert (!First->empty () && !Second->empty () && " Found an empty range" );
1082
1191
1083
- // Merge disjoint slots.
1084
- if (!First->overlaps (*Second)) {
1192
+ // Merge disjoint slots. This is a little bit tricky - see the
1193
+ // Implementation Notes section for an explanation.
1194
+ if (!First->isLiveAtIndexes (SecondS) &&
1195
+ !Second->isLiveAtIndexes (FirstS)) {
1085
1196
Changed = true ;
1086
1197
First->MergeSegmentsInAsValue (*Second, First->getValNumInfo (0 ));
1198
+
1199
+ int OldSize = FirstS.size ();
1200
+ FirstS.append (SecondS.begin (), SecondS.end ());
1201
+ auto Mid = FirstS.begin () + OldSize;
1202
+ std::inplace_merge (FirstS.begin (), Mid, FirstS.end ());
1203
+
1087
1204
SlotRemap[SecondSlot] = FirstSlot;
1088
1205
SortedSlots[J] = -1 ;
1089
1206
DEBUG (dbgs ()<<" Merging #" <<FirstSlot<<" and slots #" <<
0 commit comments