
Commit 3de3ca3

[AArch64] Optimize add/sub with immediate
Optimize ([add|sub] r, imm) -> ([ADD|SUB] ([ADD|SUB] r, #imm0, lsl #12), #imm1),
if imm == (imm0 << 12) + imm1 and both imm0 and imm1 are non-zero 12-bit unsigned
integers.

Optimize ([add|sub] r, imm) -> ([SUB|ADD] ([SUB|ADD] r, #imm0, lsl #12), #imm1),
if imm == -(imm0 << 12) - imm1 and both imm0 and imm1 are non-zero 12-bit unsigned
integers.

Reviewed By: jaykang10, dmgreen

Differential Revision: https://reviews.llvm.org/D111034
1 parent dc9f037 commit 3de3ca3
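To make the rule concrete, here is a minimal standalone sketch, not the in-tree helper: the function name splitImm and the value 0x123456 are illustrative, and the profitability check (the in-tree pass also requires that a single MOV cannot materialize the value) is omitted. It shows how a 24-bit immediate splits into two 12-bit add/sub immediates, and how a negative immediate is handled by splitting its negation and flipping the opcode.

// Standalone illustration of the immediate-splitting rule described above.
#include <cstdint>
#include <cstdio>

// Hypothetical helper: returns true when Imm == (Imm0 << 12) + Imm1 with both
// parts being non-zero 12-bit values (i.e. Imm fits in 24 bits).
static bool splitImm(uint64_t Imm, uint64_t &Imm0, uint64_t &Imm1) {
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || (Imm & ~0xffffffULL) != 0)
    return false;
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

int main() {
  uint64_t Imm0 = 0, Imm1 = 0;

  // 0x123456 == (0x123 << 12) + 0x456, so `mov` + `add` can become
  //   add w0, w0, #0x123, lsl #12
  //   add w0, w0, #0x456
  if (splitImm(0x123456, Imm0, Imm1))
    std::printf("add #0x%llx, lsl #12 ; add #0x%llx\n",
                (unsigned long long)Imm0, (unsigned long long)Imm1);

  // A negative immediate splits via its negation and uses the opposite opcode:
  // adding -0x123456 becomes two subtractions (and vice versa for sub).
  int64_t Imm = -0x123456;
  if (splitImm(static_cast<uint64_t>(-Imm), Imm0, Imm1))
    std::printf("sub #0x%llx, lsl #12 ; sub #0x%llx\n",
                (unsigned long long)Imm0, (unsigned long long)Imm1);
  return 0;
}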

File tree

3 files changed: +234 -66 lines changed


llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp

+166 -33
@@ -11,12 +11,19 @@
 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
 //    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
 //
+// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
+//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
+//
+// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
+//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
+//
 // The mov pseudo instruction could be expanded to multiple mov instructions
 // later. In this case, we could try to split the constant operand of mov
-// instruction into two bitmask immediates. It makes two AND instructions
-// intead of multiple `mov` + `and` instructions.
+// instruction into two immediates which can be directly encoded into
+// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
+// multiple `mov` + `and/add/sub` instructions.
 //
-// 2. Remove redundant ORRWrs which is generated by zero-extend.
+// 4. Remove redundant ORRWrs which is generated by zero-extend.
 //
 // %3:gpr32 = ORRWrs $wzr, %2, 0
 // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
@@ -51,6 +58,12 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   MachineLoopInfo *MLI;
   MachineRegisterInfo *MRI;
 
+  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
+                        MachineInstr *&SubregToRegMI);
+
+  template <typename T>
+  bool visitADDSUB(MachineInstr &MI,
+                   SmallSetVector<MachineInstr *, 8> &ToBeRemoved, bool IsAdd);
   template <typename T>
   bool visitAND(MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
@@ -131,36 +144,9 @@ bool AArch64MIPeepholeOpt::visitAND(
   assert((RegSize == 32 || RegSize == 64) &&
          "Invalid RegSize for AND bitmask peephole optimization");
 
-  // Check whether AND's MBB is in loop and the AND is loop invariant.
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineLoop *L = MLI->getLoopFor(MBB);
-  if (L && !L->isLoopInvariant(MI))
-    return false;
-
-  // Check whether AND's operand is MOV with immediate.
-  MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
-  if (!MovMI)
-    return false;
-
-  MachineInstr *SubregToRegMI = nullptr;
-  // If it is SUBREG_TO_REG, check its operand.
-  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
-    SubregToRegMI = MovMI;
-    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
-    if (!MovMI)
-      return false;
-  }
-
-  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
-      MovMI->getOpcode() != AArch64::MOVi64imm)
-    return false;
-
-  // If the MOV has multiple uses, do not split the immediate because it causes
-  // more instructions.
-  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
-    return false;
-
-  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+  // Perform several essential checks against current MI.
+  MachineInstr *MovMI = nullptr, *SubregToRegMI = nullptr;
+  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
     return false;
 
   // Split the bitmask immediate into two.
@@ -177,6 +163,7 @@ bool AArch64MIPeepholeOpt::visitAND(
 
   // Create new AND MIs.
   DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
   const TargetRegisterClass *ANDImmRC =
       (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
   Register DstReg = MI.getOperand(0).getReg();
@@ -251,6 +238,139 @@ bool AArch64MIPeepholeOpt::visitORR(
   return true;
 }
 
+template <typename T>
+static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
+  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
+  // imm0 and imm1 are non-zero 12-bit unsigned integers.
+  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
+      (Imm & ~static_cast<T>(0xffffff)) != 0)
+    return false;
+
+  // Bail out if the immediate can be composed via a single instruction.
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+  if (Insn.size() == 1)
+    return false;
+
+  // Split Imm into (Imm0 << 12) + Imm1.
+  Imm0 = (Imm >> 12) & 0xfff;
+  Imm1 = Imm & 0xfff;
+  return true;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitADDSUB(
+    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+    bool IsAdd) {
+  // Try below transformation.
+  //
+  // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
+  // MOVi64imm + ADDXrr ==> ADDXri + ADDXri
+  //
+  // MOVi32imm + SUBWrr ==> SUBWri + SUBWri
+  // MOVi64imm + SUBXrr ==> SUBXri + SUBXri
+  //
+  // The mov pseudo instruction could be expanded to multiple mov instructions
+  // later. Let's try to split the constant operand of mov instruction into two
+  // legal add/sub immediates. It makes only two ADD/SUB instructions instead
+  // of multiple `mov` + `add/sub` instructions.
+
+  unsigned RegSize = sizeof(T) * 8;
+  assert((RegSize == 32 || RegSize == 64) &&
+         "Invalid RegSize for legal add/sub immediate peephole optimization");
+
+  // Perform several essential checks against current MI.
+  MachineInstr *MovMI, *SubregToRegMI;
+  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
+    return false;
+
+  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
+  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
+  unsigned Opcode;
+  if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) {
+    if (IsAdd)
+      Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
+    else
+      Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
+  } else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) {
+    if (IsAdd)
+      Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
+    else
+      Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
+  } else {
+    return false;
+  }
+
+  // Create new ADD/SUB MIs.
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
+  const TargetRegisterClass *RC =
+      (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register TmpReg = MRI->createVirtualRegister(RC);
+
+  MRI->constrainRegClass(SrcReg, RC);
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), TmpReg)
+      .addReg(SrcReg)
+      .addImm(Imm0)
+      .addImm(12);
+
+  MRI->constrainRegClass(DstReg, RC);
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+      .addReg(TmpReg)
+      .addImm(Imm1)
+      .addImm(0);
+
+  // Record the MIs that need to be removed.
+  ToBeRemoved.insert(&MI);
+  if (SubregToRegMI)
+    ToBeRemoved.insert(SubregToRegMI);
+  ToBeRemoved.insert(MovMI);
+
+  return true;
+}
+
+// Checks if the corresponding MOV immediate instruction is applicable for
+// this peephole optimization.
+bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
+                                            MachineInstr *&MovMI,
+                                            MachineInstr *&SubregToRegMI) {
+  // Check whether the current MBB is in a loop and the MI is loop invariant.
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineLoop *L = MLI->getLoopFor(MBB);
+  if (L && !L->isLoopInvariant(MI))
+    return false;
+
+  // Check whether current MI's operand is MOV with immediate.
+  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+  if (!MovMI)
+    return false;
+
+  // If it is SUBREG_TO_REG, check its operand.
+  SubregToRegMI = nullptr;
+  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
+    SubregToRegMI = MovMI;
+    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
+    if (!MovMI)
+      return false;
+  }
+
+  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
+      MovMI->getOpcode() != AArch64::MOVi64imm)
+    return false;
+
+  // If the MOV has multiple uses, do not split the immediate because it causes
+  // more instructions.
+  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
+    return false;
+  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+    return false;
+
+  // It is OK to perform this peephole optimization.
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -278,6 +398,19 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
         break;
       case AArch64::ORRWrs:
         Changed = visitORR(MI, ToBeRemoved);
+        break;
+      case AArch64::ADDWrr:
+        Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, true);
+        break;
+      case AArch64::SUBWrr:
+        Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, false);
+        break;
+      case AArch64::ADDXrr:
+        Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, true);
+        break;
+      case AArch64::SUBXrr:
+        Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, false);
+        break;
       }
     }
   }
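
The guard at the top of splitAddSubImm can be read as: both 12-bit halves of the immediate must be non-zero and no bit above bit 23 may be set, otherwise either a single ADD/SUB immediate (possibly shifted) already suffices or the value is out of range for this split. A minimal assert-style sketch of that condition, with an illustrative helper name and values that are not taken from the commit:

#include <cassert>
#include <cstdint>

// Hypothetical restatement of the splitAddSubImm guard, for illustration only.
static bool isSplittable24(uint64_t Imm) {
  return (Imm & 0xfff000) != 0 && (Imm & 0xfff) != 0 &&
         (Imm & ~0xffffffULL) == 0;
}

int main() {
  assert(isSplittable24(0x123456));    // qualifies: (0x123 << 12) + 0x456
  assert(!isSplittable24(0x123000));   // low half zero: one ADD #imm, lsl #12 suffices
  assert(!isSplittable24(0x000456));   // high half zero: one ADD #imm suffices
  assert(!isSplittable24(0x1234567));  // wider than 24 bits: cannot split this way
  return 0;
}

Note that the in-tree helper additionally rejects values that a single MOV instruction can already materialize, via the expandMOVImm check shown in the diff.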
