@@ -87,10 +87,134 @@ STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
87
87
STATISTIC (StackSlotMerged, " Number of stack slot merged." );
88
88
STATISTIC (EscapedAllocas, " Number of allocas that escaped the lifetime region" );
89
89
90
+ // ===----------------------------------------------------------------------===//
91
+ // StackColoring Pass
92
+ // ===----------------------------------------------------------------------===//
93
+ //
94
+ // Stack Coloring reduces stack usage by merging stack slots when they
95
+ // can't be used together. For example, consider the following C program:
96
+ //
97
+ // void bar(char *, int);
98
+ // void foo(bool var) {
99
+ // A: {
100
+ // char z[4096];
101
+ // bar(z, 0);
102
+ // }
103
+ //
104
+ // char *p;
105
+ // char x[4096];
106
+ // char y[4096];
107
+ // if (var) {
108
+ // p = x;
109
+ // } else {
110
+ // bar(y, 1);
111
+ // p = y + 1024;
112
+ // }
113
+ // B:
114
+ // bar(p, 2);
115
+ // }
116
+ //
117
+ // Naively-compiled, this program would use 12k of stack space. However, the
118
+ // stack slot corresponding to `z` is always destroyed before either of the
119
+ // stack slots for `x` or `y` are used, and then `x` is only used if `var`
120
+ // is true, while `y` is only used if `var` is false. So at no time are 2
121
+ // of the stack slots used together, and therefore we can merge them,
122
+ // compiling the function using only a single 4k alloca:
123
+ //
124
+ // void foo(bool var) { // equivalent
125
+ // char x[4096];
126
+ // char *p;
127
+ // bar(x, 0);
128
+ // if (var) {
129
+ // p = x;
130
+ // } else {
131
+ // bar(x, 1);
132
+ // p = x + 1024;
133
+ // }
134
+ // bar(p, 2);
135
+ // }
136
+ //
137
+ // This is an important optimization if we want stack space to be under
138
+ // control in large functions, both open-coded ones and ones created by
139
+ // inlining.
90
140
//
91
141
// Implementation Notes:
92
142
// ---------------------
93
143
//
144
+ // An important part of the above reasoning is that `z` can't be accessed
145
+ // while the latter 2 calls to `bar` are running. This is justified because
146
+ // `z`'s lifetime is over after we exit from block `A:`, so any further
147
+ // accesses to it would be UB. The way we represent this information
148
+ // in LLVM is by having frontends delimit blocks with `lifetime.start`
149
+ // and `lifetime.end` intrinsics.
150
+ //
151
+ // The effect of these intrinsics seems to be as follows (maybe I should
152
+ // specify this in the reference?):
153
+ //
154
+ // L1) at start, each stack-slot is marked as *out-of-scope*, unless no
155
+ // lifetime intrinsic refers to that stack slot, in which case
156
+ // it is marked as *in-scope*.
157
+ // L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
158
+ // the stack slot is overwritten with `undef`.
159
+ // L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
160
+ // L4) on function exit, all stack slots are marked as *out-of-scope*.
161
+ // L5) `lifetime.end` is a no-op when called on a slot that is already
162
+ // *out-of-scope*.
163
+ // L6) memory accesses to *out-of-scope* stack slots are UB.
164
+ // L7) when a stack-slot is marked as *out-of-scope*, all pointers to it
165
+ // are invalidated, unless the slot is "degenerate". This is used to
166
+ // justify not marking slots as in-use until the pointer to them is
167
+ // used, but feels a bit hacky in the presence of things like LICM. See
168
+ // the "Degenerate Slots" section for more details.
169
+ //
170
+ // Now, let's ground stack coloring on these rules. We'll define a slot
171
+ // as *in-use* at a (dynamic) point in execution if it either can be
172
+ // written to at that point, or if it has live and non-undef content
173
+ // at that point.
174
+ //
175
+ // Obviously, slots that are never *in-use* together can be merged, and
176
+ // in our example `foo`, the slots for `x`, `y` and `z` are never
177
+ // in-use together (of course, sometimes slots that *are* in-use together
178
+ // might still be mergeable, but we don't care about that here).
179
+ //
180
+ // In this implementation, we successively merge pairs of slots that are
181
+ // not *in-use* together. We could be smarter - for example, we could merge
182
+ // a single large slot with 2 small slots, or we could construct the
183
+ // interference graph and run a "smart" graph coloring algorithm, but with
184
+ // that aside, how do we find out whether a pair of slots might be *in-use*
185
+ // together?
186
+ //
187
+ // From our rules, we see that *out-of-scope* slots are never *in-use*,
188
+ // and from (L7) we see that "non-degenerate" slots remain non-*in-use*
189
+ // until their address is taken. Therefore, we can approximate slot activity
190
+ // using dataflow.
191
+ //
192
+ // A subtle point: naively, we might try to figure out which pairs of
193
+ // stack-slots interfere by propagating `S in-use` through the CFG for every
194
+ // stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in
195
+ // which they are both *in-use*.
196
+ //
197
+ // That is sound, but overly conservative in some cases: in our (artificial)
198
+ // example `foo`, either `x` or `y` might be in use at the label `B:`, but
199
+ // as `x` is only in use if we came in from the `var` edge and `y` only
200
+ // if we came from the `!var` edge, they still can't be in use together.
201
+ // See PR32488 for an important real-life case.
202
+ //
203
+ // If we wanted to find all points of interference precisely, we could
204
+ // propagate `S in-use` and `S&T in-use` predicates through the CFG. That
205
+ // would be precise, but requires propagating `O(n^2)` dataflow facts.
206
+ //
207
+ // However, we aren't interested in the *set* of points of interference
208
+ // between 2 stack slots, only *whether* there *is* such a point. So we
209
+ // can rely on a little trick: for `S` and `T` to be in-use together,
210
+ // one of them needs to become in-use while the other is in-use (or
211
+ // they might both become in use simultaneously). We can check this
212
+ // by also keeping track of the points at which a stack slot might *start*
213
+ // being in-use.
214
+ //
215
+ // Exact first use:
216
+ // ----------------
217
+ //
94
218
// Consider the following motivating example:
95
219
//
96
220
// int foo() {
@@ -159,6 +283,9 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
159
283
// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
160
284
// byte stack (better).
161
285
//
286
+ // Degenerate Slots:
287
+ // -----------------
288
+ //
162
289
// Relying entirely on first-use of stack slots is problematic,
163
290
// however, due to the fact that optimizations can sometimes migrate
164
291
// uses of a variable outside of its lifetime start/end region. Here
@@ -238,10 +365,6 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
238
365
// for "b" then it will appear that 'b' has a degenerate lifetime.
239
366
//
240
367
241
- // ===----------------------------------------------------------------------===//
242
- // StackColoring Pass
243
- // ===----------------------------------------------------------------------===//
244
-
245
368
namespace {
246
369
// / StackColoring - A machine pass for merging disjoint stack allocations,
247
370
// / marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
@@ -272,8 +395,11 @@ class StackColoring : public MachineFunctionPass {
272
395
// / Maps basic blocks to a serial number.
273
396
SmallVector<const MachineBasicBlock*, 8 > BasicBlockNumbering;
274
397
275
- // / Maps liveness intervals for each slot.
398
+ // / Maps slots to their use interval. Outside of this interval, slots'
399
+ // / values are either dead or `undef` and they will not be written to.
276
400
SmallVector<std::unique_ptr<LiveInterval>, 16 > Intervals;
401
+ // / Maps slots to the points where they can become in-use.
402
+ SmallVector<SmallVector<SlotIndex, 4 >, 16 > LiveStarts;
277
403
// / VNInfo is used for the construction of LiveIntervals.
278
404
VNInfo::Allocator VNInfoAllocator;
279
405
// / SlotIndex analysis object.
@@ -676,15 +802,22 @@ void StackColoring::calculateLocalLiveness()
676
802
677
803
void StackColoring::calculateLiveIntervals (unsigned NumSlots) {
678
804
SmallVector<SlotIndex, 16 > Starts;
679
- SmallVector<SlotIndex , 16 > Finishes ;
805
+ SmallVector<bool , 16 > DefinitelyInUse ;
680
806
681
807
// For each block, find which slots are active within this block
682
808
// and update the live intervals.
683
809
for (const MachineBasicBlock &MBB : *MF) {
684
810
Starts.clear ();
685
811
Starts.resize (NumSlots);
686
- Finishes.clear ();
687
- Finishes.resize (NumSlots);
812
+ DefinitelyInUse.clear ();
813
+ DefinitelyInUse.resize (NumSlots);
814
+
815
+ // Start the interval of the slots that we previously found to be 'in-use'.
816
+ BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
817
+ for (int pos = MBBLiveness.LiveIn .find_first (); pos != -1 ;
818
+ pos = MBBLiveness.LiveIn .find_next (pos)) {
819
+ Starts[pos] = Indexes->getMBBStartIdx (&MBB);
820
+ }
688
821
689
822
// Create the interval for the basic blocks containing lifetime begin/end.
690
823
for (const MachineInstr &MI : MBB) {
@@ -696,68 +829,35 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
696
829
SlotIndex ThisIndex = Indexes->getInstructionIndex (MI);
697
830
for (auto Slot : slots) {
698
831
if (IsStart) {
699
- if (!Starts[Slot].isValid () || Starts[Slot] > ThisIndex)
832
+ // If a slot is already definitely in use, we don't have to emit
833
+ // a new start marker because there is already a pre-existing
834
+ // one.
835
+ if (!DefinitelyInUse[Slot]) {
836
+ LiveStarts[Slot].push_back (ThisIndex);
837
+ DefinitelyInUse[Slot] = true ;
838
+ }
839
+ if (!Starts[Slot].isValid ())
700
840
Starts[Slot] = ThisIndex;
701
841
} else {
702
- if (!Finishes[Slot].isValid () || Finishes[Slot] < ThisIndex)
703
- Finishes[Slot] = ThisIndex;
842
+ if (Starts[Slot].isValid ()) {
843
+ VNInfo *VNI = Intervals[Slot]->getValNumInfo (0 );
844
+ Intervals[Slot]->addSegment (
845
+ LiveInterval::Segment (Starts[Slot], ThisIndex, VNI));
846
+ Starts[Slot] = SlotIndex (); // Invalidate the start index
847
+ DefinitelyInUse[Slot] = false ;
848
+ }
704
849
}
705
850
}
706
851
}
707
852
708
- // Create the interval of the blocks that we previously found to be 'alive'.
709
- BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
710
- for (int pos = MBBLiveness.LiveIn .find_first (); pos != -1 ;
711
- pos = MBBLiveness.LiveIn .find_next (pos)) {
712
- Starts[pos] = Indexes->getMBBStartIdx (&MBB);
713
- }
714
- for (int pos = MBBLiveness.LiveOut .find_first (); pos != -1 ;
715
- pos = MBBLiveness.LiveOut .find_next (pos)) {
716
- Finishes[pos] = Indexes->getMBBEndIdx (&MBB);
717
- }
718
-
853
+ // Finish up started segments
719
854
for (unsigned i = 0 ; i < NumSlots; ++i) {
720
- //
721
- // When LifetimeStartOnFirstUse is turned on, data flow analysis
722
- // is forward (from starts to ends), not bidirectional. A
723
- // consequence of this is that we can wind up in situations
724
- // where Starts[i] is invalid but Finishes[i] is valid and vice
725
- // versa. Example:
726
- //
727
- // LIFETIME_START x
728
- // if (...) {
729
- // <use of x>
730
- // throw ...;
731
- // }
732
- // LIFETIME_END x
733
- // return 2;
734
- //
735
- //
736
- // Here the slot for "x" will not be live into the block
737
- // containing the "return 2" (since lifetimes start with first
738
- // use, not at the dominating LIFETIME_START marker).
739
- //
740
- if (Starts[i].isValid () && !Finishes[i].isValid ()) {
741
- Finishes[i] = Indexes->getMBBEndIdx (&MBB);
742
- }
743
855
if (!Starts[i].isValid ())
744
856
continue ;
745
857
746
- assert (Starts[i] && Finishes[i] && " Invalid interval" );
747
- VNInfo *ValNum = Intervals[i]->getValNumInfo (0 );
748
- SlotIndex S = Starts[i];
749
- SlotIndex F = Finishes[i];
750
- if (S < F) {
751
- // We have a single consecutive region.
752
- Intervals[i]->addSegment (LiveInterval::Segment (S, F, ValNum));
753
- } else {
754
- // We have two non-consecutive regions. This happens when
755
- // LIFETIME_START appears after the LIFETIME_END marker.
756
- SlotIndex NewStart = Indexes->getMBBStartIdx (&MBB);
757
- SlotIndex NewFin = Indexes->getMBBEndIdx (&MBB);
758
- Intervals[i]->addSegment (LiveInterval::Segment (NewStart, F, ValNum));
759
- Intervals[i]->addSegment (LiveInterval::Segment (S, NewFin, ValNum));
760
- }
858
+ SlotIndex EndIdx = Indexes->getMBBEndIdx (&MBB);
859
+ VNInfo *VNI = Intervals[i]->getValNumInfo (0 );
860
+ Intervals[i]->addSegment (LiveInterval::Segment (Starts[i], EndIdx, VNI));
761
861
}
762
862
}
763
863
}
@@ -987,6 +1087,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
987
1087
BasicBlockNumbering.clear ();
988
1088
Markers.clear ();
989
1089
Intervals.clear ();
1090
+ LiveStarts.clear ();
990
1091
VNInfoAllocator.Reset ();
991
1092
992
1093
unsigned NumSlots = MFI->getObjectIndexEnd ();
@@ -998,6 +1099,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
998
1099
SmallVector<int , 8 > SortedSlots;
999
1100
SortedSlots.reserve (NumSlots);
1000
1101
Intervals.reserve (NumSlots);
1102
+ LiveStarts.resize (NumSlots);
1001
1103
1002
1104
unsigned NumMarkers = collectMarkers (NumSlots);
1003
1105
@@ -1069,6 +1171,9 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
1069
1171
return MFI->getObjectSize (LHS) > MFI->getObjectSize (RHS);
1070
1172
});
1071
1173
1174
+ for (auto &s : LiveStarts)
1175
+ std::sort (s.begin (), s.end ());
1176
+
1072
1177
bool Changed = true ;
1073
1178
while (Changed) {
1074
1179
Changed = false ;
@@ -1084,12 +1189,22 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
1084
1189
int SecondSlot = SortedSlots[J];
1085
1190
LiveInterval *First = &*Intervals[FirstSlot];
1086
1191
LiveInterval *Second = &*Intervals[SecondSlot];
1192
+ auto &FirstS = LiveStarts[FirstSlot];
1193
+ auto &SecondS = LiveStarts[SecondSlot];
1087
1194
assert (!First->empty () && !Second->empty () && " Found an empty range" );
1088
1195
1089
- // Merge disjoint slots.
1090
- if (!First->overlaps (*Second)) {
1196
+ // Merge disjoint slots. This is a little bit tricky - see the
1197
+ // Implementation Notes section for an explanation.
1198
+ if (!First->isLiveAtIndexes (SecondS) &&
1199
+ !Second->isLiveAtIndexes (FirstS)) {
1091
1200
Changed = true ;
1092
1201
First->MergeSegmentsInAsValue (*Second, First->getValNumInfo (0 ));
1202
+
1203
+ int OldSize = FirstS.size ();
1204
+ FirstS.append (SecondS.begin (), SecondS.end ());
1205
+ auto Mid = FirstS.begin () + OldSize;
1206
+ std::inplace_merge (FirstS.begin (), Mid, FirstS.end ());
1207
+
1093
1208
SlotRemap[SecondSlot] = FirstSlot;
1094
1209
SortedSlots[J] = -1 ;
1095
1210
DEBUG (dbgs ()<<" Merging #" <<FirstSlot<<" and slots #" <<
0 commit comments