1 //===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
21
22 #define DEBUG_TYPE "pipeliner"
23 using namespace llvm;
24
// Hidden command-line override for the MVE expander ("MVE" presumably modulo
// variable expansion -- confirm against the expander below): when set, the
// true/false target blocks of the conditional branch it emits are swapped.
// Off by default.
static cl::opt<bool> SwapBranchTargetsMVE(
    "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
    cl::desc("Swap target blocks of a conditional branch for MVE expander"));
28
print(raw_ostream & OS)29 void ModuloSchedule::print(raw_ostream &OS) {
30 for (MachineInstr *MI : ScheduledInstrs)
31 OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
32 }
33
34 //===----------------------------------------------------------------------===//
35 // ModuloScheduleExpander implementation
36 //===----------------------------------------------------------------------===//
37
38 /// Return the register values for the operands of a Phi instruction.
39 /// This function assume the instruction is a Phi.
getPhiRegs(MachineInstr & Phi,MachineBasicBlock * Loop,unsigned & InitVal,unsigned & LoopVal)40 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
41 unsigned &InitVal, unsigned &LoopVal) {
42 assert(Phi.isPHI() && "Expecting a Phi.");
43
44 InitVal = 0;
45 LoopVal = 0;
46 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
47 if (Phi.getOperand(i + 1).getMBB() != Loop)
48 InitVal = Phi.getOperand(i).getReg();
49 else
50 LoopVal = Phi.getOperand(i).getReg();
51
52 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
53 }
54
55 /// Return the Phi register value that comes from the incoming block.
getInitPhiReg(MachineInstr & Phi,MachineBasicBlock * LoopBB)56 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
57 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
58 if (Phi.getOperand(i + 1).getMBB() != LoopBB)
59 return Phi.getOperand(i).getReg();
60 return 0;
61 }
62
63 /// Return the Phi register value that comes the loop block.
getLoopPhiReg(MachineInstr & Phi,MachineBasicBlock * LoopBB)64 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
65 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
66 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
67 return Phi.getOperand(i).getReg();
68 return 0;
69 }
70
/// Drive the expansion: locate the loop block and its preheader, record the
/// per-register stage-distance information consumed later by phi generation,
/// then emit the pipelined loop.
void ModuloScheduleExpander::expand() {
  BB = Schedule.getLoop()->getTopBlock();
  Preheader = *BB->pred_begin();
  // The loop block is its own predecessor via the backedge; if we picked
  // that edge first, take the other predecessor as the preheader.
  if (Preheader == BB)
    Preheader = *std::next(BB->pred_begin());

  // Iterate over the definitions in each instruction, and compute the
  // stage difference for each use. Keep the maximum value.
  for (MachineInstr *MI : Schedule.getInstructions()) {
    int DefStage = Schedule.getStage(MI);
    for (const MachineOperand &Op : MI->all_defs()) {
      Register Reg = Op.getReg();
      unsigned MaxDiff = 0;
      bool PhiIsSwapped = false;
      for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
        MachineInstr *UseMI = UseOp.getParent();
        int UseStage = Schedule.getStage(UseMI);
        unsigned Diff = 0;
        // Uses not in the schedule report stage -1 and contribute 0.
        if (UseStage != -1 && UseStage >= DefStage)
          Diff = UseStage - DefStage;
        // Note: this deliberately tests the *defining* instruction MI, not
        // UseMI -- a loop-carried Phi def extends the distance by one
        // iteration; otherwise the Phi is flagged as swapped.
        if (MI->isPHI()) {
          if (isLoopCarried(*MI))
            ++Diff;
          else
            PhiIsSwapped = true;
        }
        MaxDiff = std::max(Diff, MaxDiff);
      }
      // Record the largest def-to-use stage distance for this register.
      RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
    }
  }

  generatePipelinedLoop();
}
105
generatePipelinedLoop()106 void ModuloScheduleExpander::generatePipelinedLoop() {
107 LoopInfo = TII->analyzeLoopForPipelining(BB);
108 assert(LoopInfo && "Must be able to analyze loop!");
109
110 // Create a new basic block for the kernel and add it to the CFG.
111 MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
112
113 unsigned MaxStageCount = Schedule.getNumStages() - 1;
114
115 // Remember the registers that are used in different stages. The index is
116 // the iteration, or stage, that the instruction is scheduled in. This is
117 // a map between register names in the original block and the names created
118 // in each stage of the pipelined loop.
119 ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
120
121 // The renaming destination by Phis for the registers across stages.
122 // This map is updated during Phis generation to point to the most recent
123 // renaming destination.
124 ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
125
126 InstrMapTy InstrMap;
127
128 SmallVector<MachineBasicBlock *, 4> PrologBBs;
129
130 // Generate the prolog instructions that set up the pipeline.
131 generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
132 MF.insert(BB->getIterator(), KernelBB);
133 LIS.insertMBBInMaps(KernelBB);
134
135 // Rearrange the instructions to generate the new, pipelined loop,
136 // and update register names as needed.
137 for (MachineInstr *CI : Schedule.getInstructions()) {
138 if (CI->isPHI())
139 continue;
140 unsigned StageNum = Schedule.getStage(CI);
141 MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
142 updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
143 KernelBB->push_back(NewMI);
144 InstrMap[NewMI] = CI;
145 }
146
147 // Copy any terminator instructions to the new kernel, and update
148 // names as needed.
149 for (MachineInstr &MI : BB->terminators()) {
150 MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
151 updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
152 KernelBB->push_back(NewMI);
153 InstrMap[NewMI] = &MI;
154 }
155
156 NewKernel = KernelBB;
157 KernelBB->transferSuccessors(BB);
158 KernelBB->replaceSuccessor(BB, KernelBB);
159
160 generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
161 InstrMap, MaxStageCount, MaxStageCount, false);
162 generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
163 InstrMap, MaxStageCount, MaxStageCount, false);
164
165 LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
166
167 SmallVector<MachineBasicBlock *, 4> EpilogBBs;
168 // Generate the epilog instructions to complete the pipeline.
169 generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
170 PrologBBs);
171
172 // We need this step because the register allocation doesn't handle some
173 // situations well, so we insert copies to help out.
174 splitLifetimes(KernelBB, EpilogBBs);
175
176 // Remove dead instructions due to loop induction variables.
177 removeDeadInstructions(KernelBB, EpilogBBs);
178
179 // Add branches between prolog and epilog blocks.
180 addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
181
182 delete[] VRMap;
183 delete[] VRMapPhi;
184 }
185
cleanup()186 void ModuloScheduleExpander::cleanup() {
187 // Remove the original loop since it's no longer referenced.
188 for (auto &I : *BB)
189 LIS.RemoveMachineInstrFromMaps(I);
190 BB->clear();
191 BB->eraseFromParent();
192 }
193
194 /// Generate the pipeline prolog code.
/// Generate the pipeline prolog code.
///
/// One prolog block is created per stage except the last (which lives in the
/// kernel). Prolog block i receives clones of the instructions scheduled in
/// stages 0..i, so a single block may mix instructions from several
/// iterations of the original loop. The blocks are chained from the
/// preheader down to the kernel.
void ModuloScheduleExpander::generateProlog(unsigned LastStage,
                                            MachineBasicBlock *KernelBB,
                                            ValueMapTy *VRMap,
                                            MBBVectorTy &PrologBBs) {
  MachineBasicBlock *PredBB = Preheader;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which will be generated in the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
  for (unsigned i = 0; i < LastStage; ++i) {
    // Create and insert the prolog basic block prior to the original loop
    // basic block. The original loop is removed later.
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    PrologBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);
    // Splice the new block into the chain: it takes over PredBB's
    // successors and becomes PredBB's (only added) successor.
    NewBB->transferSuccessors(PredBB);
    PredBB->addSuccessor(NewBB);
    PredBB = NewBB;
    LIS.insertMBBInMaps(NewBB);

    // Generate instructions for each appropriate stage. Process instructions
    // in original program order.
    for (int StageNum = i; StageNum >= 0; --StageNum) {
      for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                       BBE = BB->getFirstTerminator();
           BBI != BBE; ++BBI) {
        if (Schedule.getStage(&*BBI) == StageNum) {
          // Phis are not cloned here; rewritePhiValues() below handles the
          // phi value bookkeeping for this prolog block.
          if (BBI->isPHI())
            continue;
          MachineInstr *NewMI =
              cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
          updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
          NewBB->push_back(NewMI);
          InstrMap[NewMI] = &*BBI;
        }
      }
    }
    rewritePhiValues(NewBB, i, VRMap, InstrMap);
    LLVM_DEBUG({
      dbgs() << "prolog:\n";
      NewBB->dump();
    });
  }

  // The last prolog block falls through to the kernel rather than to the
  // original loop block.
  PredBB->replaceSuccessor(BB, KernelBB);

  // Check if we need to remove the branch from the preheader to the original
  // loop, and replace it with a branch to the new loop.
  unsigned numBranches = TII->removeBranch(*Preheader);
  if (numBranches) {
    SmallVector<MachineOperand, 0> Cond;
    TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
  }
}
250
251 /// Generate the pipeline epilog code. The epilog code finishes the iterations
252 /// that were started in either the prolog or the kernel. We create a basic
253 /// block for each stage that needs to complete.
/// Generate the pipeline epilog code. The epilog code finishes the iterations
/// that were started in either the prolog or the kernel. We create a basic
/// block for each stage that needs to complete.
void ModuloScheduleExpander::generateEpilog(
    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
    ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
    MBBVectorTy &PrologBBs) {
  // We need to change the branch from the kernel to the first epilog block, so
  // this call to analyze branch uses the kernel rather than the original BB.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
  if (checkBranch)
    return;

  // Find the loop-exit successor, skipping the kernel's self-loop backedge.
  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
  if (*LoopExitI == KernelBB)
    ++LoopExitI;
  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
  MachineBasicBlock *LoopExitBB = *LoopExitI;

  MachineBasicBlock *PredBB = KernelBB;
  MachineBasicBlock *EpilogStart = LoopExitBB;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which was generated for the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
  int EpilogStage = LastStage + 1;
  for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
    EpilogBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);

    // Chain the new block between its predecessor and the loop exit.
    PredBB->replaceSuccessor(LoopExitBB, NewBB);
    NewBB->addSuccessor(LoopExitBB);
    LIS.insertMBBInMaps(NewBB);

    // Remember the first epilog block; the kernel branches to it below.
    if (EpilogStart == LoopExitBB)
      EpilogStart = NewBB;

    // Add instructions to the epilog depending on the current block.
    // Process instructions in original program order.
    for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
      for (auto &BBI : *BB) {
        if (BBI.isPHI())
          continue;
        MachineInstr *In = &BBI;
        if ((unsigned)Schedule.getStage(In) == StageNum) {
          // Instructions with memoperands in the epilog are updated with
          // conservative values.
          MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
          updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
          NewBB->push_back(NewMI);
          InstrMap[NewMI] = In;
        }
      }
    }
    generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
                         InstrMap, LastStage, EpilogStage, i == 1);
    generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
                 InstrMap, LastStage, EpilogStage, i == 1);
    PredBB = NewBB;

    LLVM_DEBUG({
      dbgs() << "epilog:\n";
      NewBB->dump();
    });
  }

  // Fix any Phi nodes in the loop exit block.
  LoopExitBB->replacePhiUsesWith(BB, PredBB);

  // Create a branch to the new epilog from the kernel.
  // Remove the original branch and add a new branch to the epilog.
  TII->removeBranch(*KernelBB);
  assert((OrigBB == TBB || OrigBB == FBB) &&
         "Unable to determine looping branch direction");
  // Keep the backedge target in the same true/false position that the
  // original analyzed branch gave it.
  if (OrigBB != TBB)
    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
  else
    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
  // Add a branch to the loop exit.
  if (EpilogBBs.size() > 0) {
    MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
    SmallVector<MachineOperand, 4> Cond1;
    TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
  }
}
341
342 /// Replace all uses of FromReg that appear outside the specified
343 /// basic block with ToReg.
replaceRegUsesAfterLoop(unsigned FromReg,unsigned ToReg,MachineBasicBlock * MBB,MachineRegisterInfo & MRI,LiveIntervals & LIS)344 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
345 MachineBasicBlock *MBB,
346 MachineRegisterInfo &MRI,
347 LiveIntervals &LIS) {
348 for (MachineOperand &O :
349 llvm::make_early_inc_range(MRI.use_operands(FromReg)))
350 if (O.getParent()->getParent() != MBB)
351 O.setReg(ToReg);
352 if (!LIS.hasInterval(ToReg))
353 LIS.createEmptyInterval(ToReg);
354 }
355
356 /// Return true if the register has a use that occurs outside the
357 /// specified loop.
hasUseAfterLoop(unsigned Reg,MachineBasicBlock * BB,MachineRegisterInfo & MRI)358 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
359 MachineRegisterInfo &MRI) {
360 for (const MachineOperand &MO : MRI.use_operands(Reg))
361 if (MO.getParent()->getParent() != BB)
362 return true;
363 return false;
364 }
365
366 /// Generate Phis for the specific block in the generated pipelined code.
367 /// This function looks at the Phis from the original code to guide the
368 /// creation of new Phis.
/// Generate Phis for the specific block in the generated pipelined code.
/// This function looks at the Phis from the original code to guide the
/// creation of new Phis.
///
/// For each original Phi it emits up to NumPhis new Phis into NewBB, wiring
/// the initial operand to BB1 and the recurrence operand to BB2, and records
/// every renamed destination in VRMap[stage]. The same routine serves both
/// the kernel (LastStageNum == CurStageNum) and the epilog blocks.
void ModuloScheduleExpander::generateExistingPhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
    unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
  // Compute the stage number for the initial value of the Phi, which
  // comes from the prolog. The prolog to use depends on to which kernel/
  // epilog that we're adding the Phi.
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  bool InKernel = (LastStageNum == CurStageNum);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - (CurStageNum - LastStageNum);
    PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
  }

  // Walk only the Phis at the top of the original loop block.
  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                   BBE = BB->getFirstNonPHI();
       BBI != BBE; ++BBI) {
    Register Def = BBI->getOperand(0).getReg();

    unsigned InitVal = 0;
    unsigned LoopVal = 0;
    getPhiRegs(*BBI, BB, InitVal, LoopVal);

    unsigned PhiOp1 = 0;
    // The Phi value from the loop body typically is defined in the loop, but
    // not always. So, we need to check if the value is defined in the loop.
    unsigned PhiOp2 = LoopVal;
    if (VRMap[LastStageNum].count(LoopVal))
      PhiOp2 = VRMap[LastStageNum][LoopVal];

    int StageScheduled = Schedule.getStage(&*BBI);
    int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
    unsigned NumStages = getStagesForReg(Def, CurStageNum);
    if (NumStages == 0) {
      // We don't need to generate a Phi anymore, but we need to rename any uses
      // of the Phi value.
      unsigned NewReg = VRMap[PrevStage][LoopVal];
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
                            InitVal, NewReg);
      if (VRMap[CurStageNum].count(LoopVal))
        VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
    }
    // Adjust the number of Phis needed depending on the number of prologs left,
    // and the distance from where the Phi is first scheduled. The number of
    // Phis cannot exceed the number of prolog stages. Each stage can
    // potentially define two values.
    unsigned MaxPhis = PrologStage + 2;
    if (!InKernel && (int)PrologStage <= LoopValStage)
      MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
    unsigned NumPhis = std::min(NumStages, MaxPhis);

    unsigned NewReg = 0;
    unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
    // In the epilog, we may need to look back one stage to get the correct
    // Phi name, because the epilog and prolog blocks execute the same stage.
    // The correct name is from the previous block only when the Phi has
    // been completely scheduled prior to the epilog, and Phi value is not
    // needed in multiple stages.
    int StageDiff = 0;
    if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
        NumPhis == 1)
      StageDiff = 1;
    // Adjust the computations below when the phi and the loop definition
    // are scheduled in different stages.
    if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
      StageDiff = StageScheduled - LoopValStage;
    for (unsigned np = 0; np < NumPhis; ++np) {
      // Select the first (initial-value) operand, PhiOp1.
      // If the Phi hasn't been scheduled, then use the initial Phi operand
      // value. Otherwise, use the scheduled version of the instruction. This
      // is a little complicated when a Phi references another Phi.
      if (np > PrologStage || StageScheduled >= (int)LastStageNum)
        PhiOp1 = InitVal;
      // Check if the Phi has already been scheduled in a prolog stage.
      else if (PrologStage >= AccessStage + StageDiff + np &&
               VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
        PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
      // Check if the Phi has already been scheduled, but the loop instruction
      // is either another Phi, or doesn't occur in the loop.
      else if (PrologStage >= AccessStage + StageDiff + np) {
        // If the Phi references another Phi, we need to examine the other
        // Phi to get the correct value.
        PhiOp1 = LoopVal;
        MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
        int Indirects = 1;
        // Follow chains of Phis in the original block until a mapped
        // (renamed) value is found or the chain leaves the block.
        while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
          int PhiStage = Schedule.getStage(InstOp1);
          if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
            PhiOp1 = getInitPhiReg(*InstOp1, BB);
          else
            PhiOp1 = getLoopPhiReg(*InstOp1, BB);
          InstOp1 = MRI.getVRegDef(PhiOp1);
          int PhiOpStage = Schedule.getStage(InstOp1);
          int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
          if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
              VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
            PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
            break;
          }
          ++Indirects;
        }
      } else
        PhiOp1 = InitVal;
      // If this references a generated Phi in the kernel, get the Phi operand
      // from the incoming block.
      if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
        if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
          PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);

      MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
      bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
      // Select the second (recurrence) operand, PhiOp2.
      // In the epilog, a map lookup is needed to get the value from the kernel,
      // or previous epilog block. How is does this depends on if the
      // instruction is scheduled in the previous block.
      if (!InKernel) {
        int StageDiffAdj = 0;
        if (LoopValStage != -1 && StageScheduled > LoopValStage)
          StageDiffAdj = StageScheduled - LoopValStage;
        // Use the loop value defined in the kernel, unless the kernel
        // contains the last definition of the Phi.
        if (np == 0 && PrevStage == LastStageNum &&
            (StageScheduled != 0 || LoopValStage != 0) &&
            VRMap[PrevStage - StageDiffAdj].count(LoopVal))
          PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
        // Use the value defined by the Phi. We add one because we switch
        // from looking at the loop value to the Phi definition.
        else if (np > 0 && PrevStage == LastStageNum &&
                 VRMap[PrevStage - np + 1].count(Def))
          PhiOp2 = VRMap[PrevStage - np + 1][Def];
        // Use the loop value defined in the kernel.
        else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
                 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
          PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
        // Use the value defined by the Phi, unless we're generating the first
        // epilog and the Phi refers to a Phi in a different stage.
        else if (VRMap[PrevStage - np].count(Def) &&
                 (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
                  (LoopValStage == StageScheduled)))
          PhiOp2 = VRMap[PrevStage - np][Def];
      }

      // Check if we can reuse an existing Phi. This occurs when a Phi
      // references another Phi, and the other Phi is scheduled in an
      // earlier stage. We can try to reuse an existing Phi up until the last
      // stage of the current Phi.
      if (LoopDefIsPhi) {
        if (static_cast<int>(PrologStage - np) >= StageScheduled) {
          int LVNumStages = getStagesForPhi(LoopVal);
          int StageDiff = (StageScheduled - LoopValStage);
          LVNumStages -= StageDiff;
          // Make sure the loop value Phi has been processed already.
          if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
            NewReg = PhiOp2;
            unsigned ReuseStage = CurStageNum;
            if (isLoopCarried(*PhiInst))
              ReuseStage -= LVNumStages;
            // Check if the Phi to reuse has been generated yet. If not, then
            // there is nothing to reuse.
            if (VRMap[ReuseStage - np].count(LoopVal)) {
              NewReg = VRMap[ReuseStage - np][LoopVal];

              rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
                                    Def, NewReg);
              // Update the map with the new Phi name.
              VRMap[CurStageNum - np][Def] = NewReg;
              PhiOp2 = NewReg;
              if (VRMap[LastStageNum - np - 1].count(LoopVal))
                PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];

              if (IsLast && np == NumPhis - 1)
                replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
              // Reused an existing Phi; no new Phi is created for this np.
              continue;
            }
          }
        }
        if (InKernel && StageDiff > 0 &&
            VRMap[CurStageNum - StageDiff - np].count(LoopVal))
          PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
      }

      // Create the new Phi: NewReg = PHI PhiOp1, BB1, PhiOp2, BB2.
      const TargetRegisterClass *RC = MRI.getRegClass(Def);
      NewReg = MRI.createVirtualRegister(RC);

      MachineInstrBuilder NewPhi =
          BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                  TII->get(TargetOpcode::PHI), NewReg);
      NewPhi.addReg(PhiOp1).addMBB(BB1);
      NewPhi.addReg(PhiOp2).addMBB(BB2);
      if (np == 0)
        InstrMap[NewPhi] = &*BBI;

      // We define the Phis after creating the new pipelined code, so
      // we need to rename the Phi values in scheduled instructions.

      unsigned PrevReg = 0;
      if (InKernel && VRMap[PrevStage - np].count(LoopVal))
        PrevReg = VRMap[PrevStage - np][LoopVal];
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
                            NewReg, PrevReg);
      // If the Phi has been scheduled, use the new name for rewriting.
      if (VRMap[CurStageNum - np].count(Def)) {
        unsigned R = VRMap[CurStageNum - np][Def];
        rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
                              NewReg);
      }

      // Check if we need to rename any uses that occurs after the loop. The
      // register to replace depends on whether the Phi is scheduled in the
      // epilog.
      if (IsLast && np == NumPhis - 1)
        replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);

      // In the kernel, a dependent Phi uses the value from this Phi.
      if (InKernel)
        PhiOp2 = NewReg;

      // Update the map with the new Phi name.
      VRMap[CurStageNum - np][Def] = NewReg;
    }

    // Rename the remaining stages to the last created Phi name.
    while (NumPhis++ < NumStages) {
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
                            NewReg, 0);
    }

    // Check if we need to rename a Phi that has been eliminated due to
    // scheduling.
    if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
      replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
  }
}
603
604 /// Generate Phis for the specified block in the generated pipelined code.
605 /// These are new Phis needed because the definition is scheduled after the
606 /// use in the pipelined sequence.
/// Generate Phis for the specified block in the generated pipelined code.
/// These are new Phis needed because the definition is scheduled after the
/// use in the pipelined sequence.
///
/// Unlike generateExistingPhis, this walks the non-Phi definitions of the
/// original loop and creates fresh Phis in NewBB wherever a value crosses a
/// stage boundary; VRMapPhi tracks the most recent Phi renaming per stage.
void ModuloScheduleExpander::generatePhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    bool IsLast) {
  // Compute the stage number that contains the initial Phi value, and
  // the Phi from the previous stage.
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  unsigned StageDiff = CurStageNum - LastStageNum;
  bool InKernel = (StageDiff == 0);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - StageDiff;
    PrevStage = LastStageNum + StageDiff - 1;
  }

  // Walk only the non-Phi instructions of the original loop block.
  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
                                   BBE = BB->instr_end();
       BBI != BBE; ++BBI) {
    for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
      MachineOperand &MO = BBI->getOperand(i);
      // Only virtual-register definitions need renaming Phis.
      if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
        continue;

      int StageScheduled = Schedule.getStage(&*BBI);
      assert(StageScheduled != -1 && "Expecting scheduled instruction.");
      Register Def = MO.getReg();
      unsigned NumPhis = getStagesForReg(Def, CurStageNum);
      // An instruction scheduled in stage 0 and is used after the loop
      // requires a phi in the epilog for the last definition from either
      // the kernel or prolog.
      if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
          hasUseAfterLoop(Def, BB, MRI))
        NumPhis = 1;
      if (!InKernel && (unsigned)StageScheduled > PrologStage)
        continue;

      unsigned PhiOp2;
      if (InKernel) {
        PhiOp2 = VRMap[PrevStage][Def];
        // If the previous value is itself a Phi generated in this block,
        // chase through to its loop-carried operand.
        if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
          if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
            PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
      }
      // The number of Phis can't exceed the number of prolog stages. The
      // prolog stage number is zero based.
      if (NumPhis > PrologStage + 1 - StageScheduled)
        NumPhis = PrologStage + 1 - StageScheduled;
      for (unsigned np = 0; np < NumPhis; ++np) {
        // Example for
        // Org:
        //   %Org = ... (Scheduled at Stage#0, NumPhi = 2)
        //
        // Prolog0 (Stage0):
        //   %Clone0 = ...
        // Prolog1 (Stage1):
        //   %Clone1 = ...
        // Kernel (Stage2):
        //   %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
        //   %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
        //   %Clone2 = ...
        // Epilog0 (Stage3):
        //   %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
        //   %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
        // Epilog1 (Stage4):
        //   %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
        //
        // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
        // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
        // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}

        unsigned PhiOp1 = VRMap[PrologStage][Def];
        if (np <= PrologStage)
          PhiOp1 = VRMap[PrologStage - np][Def];
        if (!InKernel) {
          if (PrevStage == LastStageNum && np == 0)
            PhiOp2 = VRMap[LastStageNum][Def];
          else
            PhiOp2 = VRMapPhi[PrevStage - np][Def];
        }

        const TargetRegisterClass *RC = MRI.getRegClass(Def);
        Register NewReg = MRI.createVirtualRegister(RC);

        // NewReg = PHI PhiOp1, BB1, PhiOp2, BB2.
        MachineInstrBuilder NewPhi =
            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                    TII->get(TargetOpcode::PHI), NewReg);
        NewPhi.addReg(PhiOp1).addMBB(BB1);
        NewPhi.addReg(PhiOp2).addMBB(BB2);
        if (np == 0)
          InstrMap[NewPhi] = &*BBI;

        // Rewrite uses and update the map. The actions depend upon whether
        // we generating code for the kernel or epilog blocks.
        if (InKernel) {
          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
                                NewReg);
          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
                                NewReg);

          PhiOp2 = NewReg;
          VRMapPhi[PrevStage - np - 1][Def] = NewReg;
        } else {
          VRMapPhi[CurStageNum - np][Def] = NewReg;
          if (np == NumPhis - 1)
            rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
                                  NewReg);
        }
        if (IsLast && np == NumPhis - 1)
          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
      }
    }
  }
}
724
725 /// Remove instructions that generate values with no uses.
726 /// Typically, these are induction variable operations that generate values
727 /// used in the loop itself. A dead instruction has a definition with
728 /// no uses, or uses that occur in the original loop only.
/// Remove instructions that generate values with no uses.
/// Typically, these are induction variable operations that generate values
/// used in the loop itself. A dead instruction has a definition with
/// no uses, or uses that occur in the original loop only.
void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
                                                    MBBVectorTy &EpilogBBs) {
  // For each epilog block, check that the value defined by each instruction
  // is used. If not, delete it. Iterate in reverse so a deleted user can make
  // its operands' producers dead before they are visited.
  for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
    for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
                                                   ME = MBB->instr_rend();
         MI != ME;) {
      // From DeadMachineInstructionElem. Don't delete inline assembly.
      if (MI->isInlineAsm()) {
        ++MI;
        continue;
      }
      bool SawStore = false;
      // Check if it's safe to remove the instruction due to side effects.
      // We can, and want to, remove Phis here.
      if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
        ++MI;
        continue;
      }
      bool used = true;
      for (const MachineOperand &MO : MI->all_defs()) {
        Register reg = MO.getReg();
        // Assume physical registers are used, unless they are marked dead.
        if (reg.isPhysical()) {
          used = !MO.isDead();
          if (used)
            break;
          continue;
        }
        unsigned realUses = 0;
        for (const MachineOperand &U : MRI.use_operands(reg)) {
          // Check if there are any uses that occur only in the original
          // loop. If so, that's not a real use.
          if (U.getParent()->getParent() != BB) {
            realUses++;
            used = true;
            break;
          }
        }
        if (realUses > 0)
          break;
        used = false;
      }
      if (!used) {
        // Advance the iterator past the instruction before erasing it so the
        // reverse iteration stays valid.
        LIS.RemoveMachineInstrFromMaps(*MI);
        MI++->eraseFromParent();
        continue;
      }
      ++MI;
    }
  // In the kernel block, check if we can remove a Phi that generates a value
  // used in an instruction removed in the epilog block.
  for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
    Register reg = MI.getOperand(0).getReg();
    if (MRI.use_begin(reg) == MRI.use_end()) {
      LIS.RemoveMachineInstrFromMaps(MI);
      MI.eraseFromParent();
    }
  }
}
790
/// For loop carried definitions, we split the lifetime of a virtual register
/// that has uses past the definition in the next iteration. A copy with a new
/// virtual register is inserted before the definition, which helps with
/// generating a better register assignment.
///
///   v1 = phi(a, v2)     v1 = phi(a, v2)
///   v2 = phi(b, v3)     v2 = phi(b, v3)
///   v3 = ..             v4 = copy v1
///   .. = V1             v3 = ..
///                       .. = v4
void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
                                            MBBVectorTy &EpilogBBs) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  for (auto &PHI : KernelBB->phis()) {
    Register Def = PHI.getOperand(0).getReg();
    // Check for any Phi definition that is used as an operand of another Phi
    // in the same block.
    for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
                                                 E = MRI.use_instr_end();
         I != E; ++I) {
      if (I->isPHI() && I->getParent() == KernelBB) {
        // Get the loop carried definition.
        unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
        if (!LCDef)
          continue;
        MachineInstr *MI = MRI.getVRegDef(LCDef);
        if (!MI || MI->getParent() != KernelBB || MI->isPHI())
          continue;
        // Search through the rest of the block looking for uses of the Phi
        // definition. If one occurs, then split the lifetime.
        unsigned SplitReg = 0;
        for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
                                    KernelBB->instr_end()))
          if (BBJ.readsRegister(Def, /*TRI=*/nullptr)) {
            // We split the lifetime when we find the first use. The COPY is
            // placed just before the loop-carried definition.
            if (SplitReg == 0) {
              SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
              BuildMI(*KernelBB, MI, MI->getDebugLoc(),
                      TII->get(TargetOpcode::COPY), SplitReg)
                  .addReg(Def);
            }
            BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
          }
        if (!SplitReg)
          continue;
        // Search through each of the epilog blocks for any uses to be renamed.
        for (auto &Epilog : EpilogBBs)
          for (auto &I : *Epilog)
            if (I.readsRegister(Def, /*TRI=*/nullptr))
              I.substituteRegister(Def, SplitReg, 0, *TRI);
        break;
      }
    }
  }
}
846
847 /// Remove the incoming block from the Phis in a basic block.
removePhis(MachineBasicBlock * BB,MachineBasicBlock * Incoming)848 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
849 for (MachineInstr &MI : *BB) {
850 if (!MI.isPHI())
851 break;
852 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
853 if (MI.getOperand(i + 1).getMBB() == Incoming) {
854 MI.removeOperand(i + 1);
855 MI.removeOperand(i);
856 break;
857 }
858 }
859 }
860
/// Create branches from each prolog basic block to the appropriate epilog
/// block. These edges are needed if the loop ends before reaching the
/// kernel.
void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
                                         MBBVectorTy &PrologBBs,
                                         MachineBasicBlock *KernelBB,
                                         MBBVectorTy &EpilogBBs,
                                         ValueMapTy *VRMap) {
  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
  MachineBasicBlock *LastPro = KernelBB;
  MachineBasicBlock *LastEpi = KernelBB;

  // Start from the blocks connected to the kernel and work "out"
  // to the first prolog and the last epilog blocks.
  SmallVector<MachineInstr *, 4> PrevInsts;
  unsigned MaxIter = PrologBBs.size() - 1;
  for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
    // Add branches to the prolog that go to the corresponding
    // epilog, and the fall-thru prolog/kernel block.
    MachineBasicBlock *Prolog = PrologBBs[j];
    MachineBasicBlock *Epilog = EpilogBBs[i];

    SmallVector<MachineOperand, 4> Cond;
    std::optional<bool> StaticallyGreater =
        LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
    unsigned numAdded = 0;
    if (!StaticallyGreater) {
      // Trip count is not statically known: conditional branch that either
      // exits to the epilog or falls through toward the kernel.
      Prolog->addSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
    } else if (*StaticallyGreater == false) {
      // The trip count is statically too small to reach this stage: branch
      // straight to the epilog and delete the now-unreachable inner
      // prolog/epilog pair.
      Prolog->addSuccessor(Epilog);
      Prolog->removeSuccessor(LastPro);
      LastEpi->removeSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
      removePhis(Epilog, LastEpi);
      // Remove the blocks that are no longer referenced.
      if (LastPro != LastEpi) {
        LastEpi->clear();
        LastEpi->eraseFromParent();
      }
      if (LastPro == KernelBB) {
        // The kernel itself is being removed.
        LoopInfo->disposed();
        NewKernel = nullptr;
      }
      LastPro->clear();
      LastPro->eraseFromParent();
    } else {
      // The trip count is statically large enough: the exit to the epilog
      // can never be taken from this prolog.
      numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
      removePhis(Epilog, Prolog);
    }
    LastPro = Prolog;
    LastEpi = Epilog;
    // Rename registers in the newly inserted branch/compare instructions at
    // the end of the prolog (numAdded instructions were appended).
    for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
                                                   E = Prolog->instr_rend();
         I != E && numAdded > 0; ++I, --numAdded)
      updateInstruction(&*I, false, j, 0, VRMap);
  }

  if (NewKernel) {
    LoopInfo->setPreheader(PrologBBs[MaxIter]);
    LoopInfo->adjustTripCount(-(MaxIter + 1));
  }
}
924
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  if (!BaseOp->isReg())
    return false;

  Register BaseReg = BaseOp->getReg();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  // NOTE(review): since D is initialized to 0, a failed getIncrementValue
  // query normally hits this early return; it is skipped only if the failed
  // query left D negative. This looks like it was meant to be a plain
  // failure check — confirm intent before changing.
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}
961
/// Update the memory operand with a new offset when the pipeliner
/// generates a new copy of the instruction that refers to a
/// different memory location.
void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
                                               MachineInstr &OldMI,
                                               unsigned Num) {
  // Num is the stage distance between the clone and the original; a clone in
  // the same stage accesses the same location.
  if (Num == 0)
    return;
  // If the instruction has memory operands, then adjust the offset
  // when the instruction appears in different stages.
  if (NewMI.memoperands_empty())
    return;
  SmallVector<MachineMemOperand *, 2> NewMMOs;
  for (MachineMemOperand *MMO : NewMI.memoperands()) {
    // Keep volatile/atomic/invariant-dereferenceable accesses, and those
    // without an IR value, unchanged.
    // TODO: Figure out whether isAtomic is really necessary (see D57601).
    if (MMO->isVolatile() || MMO->isAtomic() ||
        (MMO->isInvariant() && MMO->isDereferenceable()) ||
        (!MMO->getValue())) {
      NewMMOs.push_back(MMO);
      continue;
    }
    unsigned Delta;
    if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
      // The address advances by Delta per iteration, so shift the memory
      // operand by Delta * Num for a clone Num stages away.
      int64_t AdjOffset = Delta * Num;
      NewMMOs.push_back(
          MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
    } else {
      // Unknown stride: conservatively widen the operand to an unknown
      // extent so alias analysis stays correct.
      NewMMOs.push_back(MF.getMachineMemOperand(
          MMO, 0, LocationSize::beforeOrAfterPointer()));
    }
  }
  NewMI.setMemRefs(MF, NewMMOs);
}
995
/// Clone the instruction for the new pipelined loop and update the
/// memory operands, if needed.
MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
                                                 unsigned CurStageNum,
                                                 unsigned InstStageNum) {
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
  // The clone is CurStageNum - InstStageNum stages away from the original,
  // so its memory references may need an offset adjustment.
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
  return NewMI;
}
1005
/// Clone the instruction for the new pipelined loop. If needed, this
/// function updates the instruction using the values saved in the
/// InstrChanges structure.
MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
    MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
  auto It = InstrChanges.find(OldMI);
  if (It != InstrChanges.end()) {
    // The pipeliner recorded a <base register, offset delta> pair for this
    // instruction; rebase the immediate offset for the clone's stage.
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
      return nullptr;
    int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
    MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
    if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
      // The base register's in-loop definition is scheduled in a later
      // stage; compensate with the recorded per-stage offset delta.
      NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
    NewMI->getOperand(OffsetPos).setImm(NewOffset);
  }
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
  return NewMI;
}
1027
/// Update the machine instruction with new virtual registers. This
/// function may change the definitions and/or uses.
void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
                                               bool LastDef,
                                               unsigned CurStageNum,
                                               unsigned InstrStageNum,
                                               ValueMapTy *VRMap) {
  for (MachineOperand &MO : NewMI->operands()) {
    if (!MO.isReg() || !MO.getReg().isVirtual())
      continue;
    Register reg = MO.getReg();
    if (MO.isDef()) {
      // Create a new virtual register for the definition.
      const TargetRegisterClass *RC = MRI.getRegClass(reg);
      Register NewReg = MRI.createVirtualRegister(RC);
      MO.setReg(NewReg);
      // Record the rename for this stage so later uses can find it.
      VRMap[CurStageNum][reg] = NewReg;
      if (LastDef)
        // This clone carries the final definition of reg, so uses after the
        // loop must be redirected to the new name.
        replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
    } else if (MO.isUse()) {
      MachineInstr *Def = MRI.getVRegDef(reg);
      // Compute the stage that contains the last definition for instruction.
      int DefStageNum = Schedule.getStage(Def);
      unsigned StageNum = CurStageNum;
      if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
        // Compute the difference in stages between the definition and the
        // use.
        unsigned StageDiff = (InstrStageNum - DefStageNum);
        // Make an adjustment to get the last definition.
        StageNum -= StageDiff;
      }
      if (VRMap[StageNum].count(reg))
        MO.setReg(VRMap[StageNum][reg]);
    }
  }
}
1063
1064 /// Return the instruction in the loop that defines the register.
1065 /// If the definition is a Phi, then follow the Phi operand to
1066 /// the instruction in the loop.
findDefInLoop(unsigned Reg)1067 MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
1068 SmallPtrSet<MachineInstr *, 8> Visited;
1069 MachineInstr *Def = MRI.getVRegDef(Reg);
1070 while (Def->isPHI()) {
1071 if (!Visited.insert(Def).second)
1072 break;
1073 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1074 if (Def->getOperand(i + 1).getMBB() == BB) {
1075 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
1076 break;
1077 }
1078 }
1079 return Def;
1080 }
1081
/// Return the new name for the value from the previous stage.
/// \p StageNum is the stage being expanded, \p PhiStage/\p LoopStage are the
/// scheduled stages of the Phi and of its loop-carried definition, and
/// \p VRMap holds per-stage register renames. Returns 0 when no mapping
/// applies (in particular when StageNum <= PhiStage).
unsigned ModuloScheduleExpander::getPrevMapVal(
    unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
    ValueMapTy *VRMap, MachineBasicBlock *BB) {
  unsigned PrevVal = 0;
  if (StageNum > PhiStage) {
    MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
    if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
      // The name is defined in the previous stage.
      PrevVal = VRMap[StageNum - 1][LoopVal];
    else if (VRMap[StageNum].count(LoopVal))
      // The previous name is defined in the current stage when the instruction
      // order is swapped.
      PrevVal = VRMap[StageNum][LoopVal];
    else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
      // The loop value hasn't yet been scheduled.
      PrevVal = LoopVal;
    else if (StageNum == PhiStage + 1)
      // The loop value is another phi, which has not been scheduled.
      PrevVal = getInitPhiReg(*LoopInst, BB);
    else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
      // The loop value is another phi, which has been scheduled; recurse
      // through it one stage earlier.
      PrevVal =
          getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
                        LoopStage, VRMap, BB);
  }
  return PrevVal;
}
1110
/// Rewrite the Phi values in the specified block to use the mappings
/// from the initial operand. Once the Phi is scheduled, we switch
/// to using the loop value instead of the Phi value, so those names
/// do not need to be rewritten.
void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
                                              unsigned StageNum,
                                              ValueMapTy *VRMap,
                                              InstrMapTy &InstrMap) {
  for (auto &PHI : BB->phis()) {
    unsigned InitVal = 0;
    unsigned LoopVal = 0;
    getPhiRegs(PHI, BB, InitVal, LoopVal);
    Register PhiDef = PHI.getOperand(0).getReg();

    unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
    unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
    // One rewrite per stage the Phi value lives across, capped by the number
    // of stages expanded so far.
    unsigned NumPhis = getStagesForPhi(PhiDef);
    if (NumPhis > StageNum)
      NumPhis = StageNum;
    for (unsigned np = 0; np <= NumPhis; ++np) {
      unsigned NewVal =
          getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
      if (!NewVal)
        // No mapping from a previous stage; fall back to the initial value.
        NewVal = InitVal;
      rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
                            NewVal);
    }
  }
}
1140
/// Rewrite a previously scheduled instruction to use the register value
/// from the new instruction. Make sure the instruction occurs in the
/// basic block, and we don't change the uses in the new instruction.
void ModuloScheduleExpander::rewriteScheduledInstr(
    MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
    unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
    unsigned PrevReg) {
  bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
  int StagePhi = Schedule.getStage(Phi) + PhiNum;
  // Rewrite uses that have been scheduled already to use the new
  // Phi register.
  for (MachineOperand &UseOp :
       llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
    MachineInstr *UseMI = UseOp.getParent();
    // Only rewrite uses inside the block being expanded.
    if (UseMI->getParent() != BB)
      continue;
    if (UseMI->isPHI()) {
      // Skip the Phi that defines NewReg when rewriting from a non-Phi, and
      // only rewrite Phis whose loop-carried operand is OldReg.
      if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
        continue;
      if (getLoopPhiReg(*UseMI, BB) != OldReg)
        continue;
    }
    InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
    assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
    MachineInstr *OrigMI = OrigInstr->second;
    int StageSched = Schedule.getStage(OrigMI);
    int CycleSched = Schedule.getCycle(OrigMI);
    unsigned ReplaceReg = 0;
    // This is the stage for the scheduled instruction.
    if (StagePhi == StageSched && Phi->isPHI()) {
      int CyclePhi = Schedule.getCycle(Phi);
      if (PrevReg && InProlog)
        ReplaceReg = PrevReg;
      else if (PrevReg && !isLoopCarried(*Phi) &&
               (CyclePhi <= CycleSched || OrigMI->isPHI()))
        ReplaceReg = PrevReg;
      else
        ReplaceReg = NewReg;
    }
    // The scheduled instruction occurs before the scheduled Phi, and the
    // Phi is not loop carried.
    if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
      ReplaceReg = NewReg;
    if (StagePhi > StageSched && Phi->isPHI())
      ReplaceReg = NewReg;
    if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
      ReplaceReg = NewReg;
    if (ReplaceReg) {
      // Prefer constraining the replacement to OldReg's register class; if
      // that is impossible, insert a COPY and use the copy instead.
      const TargetRegisterClass *NRC =
          MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
      if (NRC)
        UseOp.setReg(ReplaceReg);
      else {
        Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
        BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY),
                SplitReg)
            .addReg(ReplaceReg);
        UseOp.setReg(SplitReg);
      }
    }
  }
}
1203
isLoopCarried(MachineInstr & Phi)1204 bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1205 if (!Phi.isPHI())
1206 return false;
1207 int DefCycle = Schedule.getCycle(&Phi);
1208 int DefStage = Schedule.getStage(&Phi);
1209
1210 unsigned InitVal = 0;
1211 unsigned LoopVal = 0;
1212 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
1213 MachineInstr *Use = MRI.getVRegDef(LoopVal);
1214 if (!Use || Use->isPHI())
1215 return true;
1216 int LoopCycle = Schedule.getCycle(Use);
1217 int LoopStage = Schedule.getStage(Use);
1218 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1219 }
1220
1221 //===----------------------------------------------------------------------===//
1222 // PeelingModuloScheduleExpander implementation
1223 //===----------------------------------------------------------------------===//
1224 // This is a reimplementation of ModuloScheduleExpander that works by creating
1225 // a fully correct steady-state kernel and peeling off the prolog and epilogs.
1226 //===----------------------------------------------------------------------===//
1227
1228 namespace {
// Remove any dead phis in MBB. Dead phis either have only one block as input
// (in which case they are the identity) or have no uses.
void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
                       LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
  // Deleting one phi can make another one dead, so iterate to a fixed point.
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
      assert(MI.isPHI());
      if (MRI.use_empty(MI.getOperand(0).getReg())) {
        // No uses at all: delete outright.
        if (LIS)
          LIS->RemoveMachineInstrFromMaps(MI);
        MI.eraseFromParent();
        Changed = true;
      } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
        // Exactly one <reg, block> input: the phi is the identity, so
        // forward its input register to all users and delete it.
        const TargetRegisterClass *ConstrainRegClass =
            MRI.constrainRegClass(MI.getOperand(1).getReg(),
                                  MRI.getRegClass(MI.getOperand(0).getReg()));
        assert(ConstrainRegClass &&
               "Expected a valid constrained register class!");
        (void)ConstrainRegClass;
        MRI.replaceRegWith(MI.getOperand(0).getReg(),
                           MI.getOperand(1).getReg());
        if (LIS)
          LIS->RemoveMachineInstrFromMaps(MI);
        MI.eraseFromParent();
        Changed = true;
      }
    }
  }
}
1260
/// Rewrites the kernel block in-place to adhere to the given schedule.
/// KernelRewriter holds all of the state required to perform the rewriting.
class KernelRewriter {
  // The schedule being applied.
  ModuloSchedule &S;
  // The single-block loop (kernel) being rewritten.
  MachineBasicBlock *BB;
  MachineBasicBlock *PreheaderBB, *ExitBB;
  MachineRegisterInfo &MRI;
  const TargetInstrInfo *TII;
  // May be null; live-interval maps are updated only when non-null.
  LiveIntervals *LIS;

  // Map from register class to canonical undef register for that class.
  DenseMap<const TargetRegisterClass *, Register> Undefs;
  // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
  // this map is only used when InitReg is non-undef.
  DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
  // Map from LoopReg to phi register where the InitReg is undef.
  DenseMap<Register, Register> UndefPhis;

  // Reg is used by MI. Return the new register MI should use to adhere to the
  // schedule. Insert phis as necessary.
  Register remapUse(Register Reg, MachineInstr &MI);
  // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
  // If InitReg is not given it is chosen arbitrarily. It will either be undef
  // or will be chosen so as to share another phi.
  Register phi(Register LoopReg, std::optional<Register> InitReg = {},
               const TargetRegisterClass *RC = nullptr);
  // Create an undef register of the given register class.
  Register undef(const TargetRegisterClass *RC);

public:
  KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
                 LiveIntervals *LIS = nullptr);
  void rewrite();
};
1295 } // namespace
1296
KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
                               MachineBasicBlock *LoopBB, LiveIntervals *LIS)
    : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
      ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
      TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
  // Re-derive the preheader as the non-loop predecessor of the header,
  // overriding getLoopPreheader()'s answer from the initializer list.
  // NOTE(review): presumably this handles the case where getLoopPreheader()
  // returns null; it assumes BB's only predecessors are itself and the
  // preheader — confirm against callers.
  PreheaderBB = *BB->pred_begin();
  if (PreheaderBB == BB)
    PreheaderBB = *std::next(BB->pred_begin());
}
1306
void KernelRewriter::rewrite() {
  // Rearrange the loop to be in schedule order. Note that the schedule may
  // contain instructions that are not owned by the loop block (InstrChanges and
  // friends), so we gracefully handle unowned instructions and delete any
  // instructions that weren't in the schedule.
  auto InsertPt = BB->getFirstTerminator();
  MachineInstr *FirstMI = nullptr;
  for (MachineInstr *MI : S.getInstructions()) {
    if (MI->isPHI())
      continue;
    if (MI->getParent())
      MI->removeFromParent();
    BB->insert(InsertPt, MI);
    if (!FirstMI)
      FirstMI = MI;
  }
  assert(FirstMI && "Failed to find first MI in schedule");

  // At this point all of the scheduled instructions are between FirstMI
  // and the end of the block. Kill from the first non-phi to FirstMI.
  for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(*I);
    (I++)->eraseFromParent();
  }

  // Now remap every instruction in the loop.
  for (MachineInstr &MI : *BB) {
    if (MI.isPHI() || MI.isTerminator())
      continue;
    for (MachineOperand &MO : MI.uses()) {
      if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
        continue;
      Register Reg = remapUse(MO.getReg(), MI);
      MO.setReg(Reg);
    }
  }
  // Remapping may have left phis without users; clean them up.
  EliminateDeadPhis(BB, MRI, LIS);

  // Ensure a phi exists for all instructions that are either referenced by
  // an illegal phi or by an instruction outside the loop. This allows us to
  // treat remaps of these values the same as "normal" values that come from
  // loop-carried phis.
  for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
    if (MI->isPHI()) {
      // A phi after the first non-phi is one of the mid-block "illegal"
      // phis created by remapUse.
      Register R = MI->getOperand(0).getReg();
      phi(R);
      continue;
    }

    for (MachineOperand &Def : MI->defs()) {
      for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
        if (MI.getParent() != BB) {
          phi(Def.getReg());
          break;
        }
      }
    }
  }
}
1367
/// Return the register that \p MI should read in place of \p Reg so the
/// value it observes matches the modulo schedule. May insert loop-carried
/// phis (and, in one case, a mid-block "illegal" phi) to bridge stage
/// differences between producer and consumer.
Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
  MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
  if (!Producer)
    return Reg;

  int ConsumerStage = S.getStage(&MI);
  if (!Producer->isPHI()) {
    // Non-phi producers are simple to remap. Insert as many phis as the
    // difference between the consumer and producer stages.
    if (Producer->getParent() != BB)
      // Producer was not inside the loop. Use the register as-is.
      return Reg;
    int ProducerStage = S.getStage(Producer);
    assert(ConsumerStage != -1 &&
           "In-loop consumer should always be scheduled!");
    assert(ConsumerStage >= ProducerStage);
    unsigned StageDiff = ConsumerStage - ProducerStage;

    for (unsigned I = 0; I < StageDiff; ++I)
      Reg = phi(Reg);
    return Reg;
  }

  // First, dive through the phi chain to find the defaults for the generated
  // phis.
  SmallVector<std::optional<Register>, 4> Defaults;
  Register LoopReg = Reg;
  auto LoopProducer = Producer;
  while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
    LoopReg = getLoopPhiReg(*LoopProducer, BB);
    Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
    LoopProducer = MRI.getUniqueVRegDef(LoopReg);
    assert(LoopProducer);
  }
  int LoopProducerStage = S.getStage(LoopProducer);

  std::optional<Register> IllegalPhiDefault;

  if (LoopProducerStage == -1) {
    // Do nothing.
  } else if (LoopProducerStage > ConsumerStage) {
    // This schedule is only representable if ProducerStage == ConsumerStage+1.
    // In addition, Consumer's cycle must be scheduled after Producer in the
    // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
    // functions.
#ifndef NDEBUG // Silence unused variables in non-asserts mode.
    int LoopProducerCycle = S.getCycle(LoopProducer);
    int ConsumerCycle = S.getCycle(&MI);
#endif
    assert(LoopProducerCycle <= ConsumerCycle);
    assert(LoopProducerStage == ConsumerStage + 1);
    // Peel off the first phi from Defaults and insert a phi between producer
    // and consumer. This phi will not be at the front of the block so we
    // consider it illegal. It will only exist during the rewrite process; it
    // needs to exist while we peel off prologs because these could take the
    // default value. After that we can replace all uses with the loop producer
    // value.
    IllegalPhiDefault = Defaults.front();
    Defaults.erase(Defaults.begin());
  } else {
    assert(ConsumerStage >= LoopProducerStage);
    int StageDiff = ConsumerStage - LoopProducerStage;
    if (StageDiff > 0) {
      LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
                        << " to " << (Defaults.size() + StageDiff) << "\n");
      // If we need more phis than we have defaults for, pad out with undefs for
      // the earliest phis, which are at the end of the defaults chain (the
      // chain is in reverse order).
      Defaults.resize(Defaults.size() + StageDiff,
                      Defaults.empty() ? std::optional<Register>()
                                       : Defaults.back());
    }
  }

  // Now we know the number of stages to jump back, insert the phi chain.
  auto DefaultI = Defaults.rbegin();
  while (DefaultI != Defaults.rend())
    LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));

  if (IllegalPhiDefault) {
    // The consumer optionally consumes LoopProducer in the same iteration
    // (because the producer is scheduled at an earlier cycle than the consumer)
    // or the initial value. To facilitate this we create an illegal block here
    // by embedding a phi in the middle of the block. We will fix this up
    // immediately prior to pruning.
    auto RC = MRI.getRegClass(Reg);
    Register R = MRI.createVirtualRegister(RC);
    MachineInstr *IllegalPhi =
        BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
            .addReg(*IllegalPhiDefault)
            .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
            .addReg(LoopReg)
            .addMBB(BB); // Block choice is arbitrary and has no effect.
    // Illegal phi should belong to the producer stage so that it can be
    // filtered correctly during peeling.
    S.setStage(IllegalPhi, LoopProducerStage);
    return R;
  }

  return LoopReg;
}
1469
/// Create (or reuse) a loop-carried phi: the result reads \p LoopReg when
/// control arrives from the loop body and \p InitReg (undef if not given)
/// when arriving from the preheader. Existing phis are reused wherever
/// possible, including upgrading a phi whose init input was undef.
Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
                             const TargetRegisterClass *RC) {
  // If the init register is not undef, try and find an existing phi.
  if (InitReg) {
    auto I = Phis.find({LoopReg, *InitReg});
    if (I != Phis.end())
      return I->second;
  } else {
    // Init value is don't-care: any phi over LoopReg will do.
    for (auto &KV : Phis) {
      if (KV.first.first == LoopReg)
        return KV.second;
    }
  }

  // InitReg is either undef or no existing phi takes InitReg as input. Try and
  // find a phi that takes undef as input.
  auto I = UndefPhis.find(LoopReg);
  if (I != UndefPhis.end()) {
    Register R = I->second;
    if (!InitReg)
      // Found a phi taking undef as input, and this input is undef so return
      // without any more changes.
      return R;
    // Found a phi taking undef as input, so rewrite it to take InitReg.
    MachineInstr *MI = MRI.getVRegDef(R);
    MI->getOperand(1).setReg(*InitReg);
    Phis.insert({{LoopReg, *InitReg}, R});
    const TargetRegisterClass *ConstrainRegClass =
        MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
    assert(ConstrainRegClass && "Expected a valid constrained register class!");
    (void)ConstrainRegClass;
    UndefPhis.erase(I);
    return R;
  }

  // Failed to find any existing phi to reuse, so create a new one.
  if (!RC)
    RC = MRI.getRegClass(LoopReg);
  Register R = MRI.createVirtualRegister(RC);
  if (InitReg) {
    const TargetRegisterClass *ConstrainRegClass =
        MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
    assert(ConstrainRegClass && "Expected a valid constrained register class!");
    (void)ConstrainRegClass;
  }
  BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
      .addReg(InitReg ? *InitReg : undef(RC))
      .addMBB(PreheaderBB)
      .addReg(LoopReg)
      .addMBB(BB);
  if (!InitReg)
    UndefPhis[LoopReg] = R;
  else
    Phis[{LoopReg, *InitReg}] = R;
  return R;
}
1526
undef(const TargetRegisterClass * RC)1527 Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1528 Register &R = Undefs[RC];
1529 if (R == 0) {
1530 // Create an IMPLICIT_DEF that defines this register if we need it.
1531 // All uses of this should be removed by the time we have finished unrolling
1532 // prologs and epilogs.
1533 R = MRI.createVirtualRegister(RC);
1534 auto *InsertBB = &PreheaderBB->getParent()->front();
1535 BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
1536 TII->get(TargetOpcode::IMPLICIT_DEF), R);
1537 }
1538 return R;
1539 }
1540
1541 namespace {
/// Describes an operand in the kernel of a pipelined loop. Characteristics of
/// the operand are discovered, such as how many in-loop PHIs it has to jump
/// through and defaults for these phis.
class KernelOperandInfo {
  MachineBasicBlock *BB;
  MachineRegisterInfo &MRI;
  // Init (preheader) value of each legal phi crossed while walking from
  // Source toward Target, in the order they were crossed.
  SmallVector<Register, 4> PhiDefaults;
  // The operand this info was built from.
  MachineOperand *Source;
  // The operand reached after walking back through copies and phis.
  MachineOperand *Target;

public:
  KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
                    const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
      : MRI(MRI) {
    Source = MO;
    BB = MO->getParent()->getParent();
    // Walk backwards through in-loop full copies and phis, recording the
    // default (init) value of every legal phi crossed.
    while (isRegInLoop(MO)) {
      MachineInstr *MI = MRI.getVRegDef(MO->getReg());
      if (MI->isFullCopy()) {
        MO = &MI->getOperand(1);
        continue;
      }
      if (!MI->isPHI())
        break;
      // If this is an illegal phi, don't count it in distance.
      if (IllegalPhis.count(MI)) {
        MO = &MI->getOperand(3);
        continue;
      }

      Register Default = getInitPhiReg(*MI, BB);
      MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
                                            : &MI->getOperand(3);
      PhiDefaults.push_back(Default);
    }
    Target = MO;
  }

  // Two operands compare equal when they jump through the same number of
  // phis (the defaults themselves are not compared).
  bool operator==(const KernelOperandInfo &Other) const {
    return PhiDefaults.size() == Other.PhiDefaults.size();
  }

  void print(raw_ostream &OS) const {
    OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
       << *Source->getParent();
  }

private:
  // True if MO is a virtual register whose definition is inside BB.
  bool isRegInLoop(MachineOperand *MO) {
    return MO->isReg() && MO->getReg().isVirtual() &&
           MRI.getVRegDef(MO->getReg())->getParent() == BB;
  }
};
1595 } // namespace
1596
/// Peel one iteration off the kernel in direction \p LPD (front = a new
/// prolog predecessor, back = a new epilog successor) and record the
/// correspondence between the peeled instructions and their kernel originals.
MachineBasicBlock *
PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
  MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
  if (LPD == LPD_Front)
    PeeledFront.push_back(NewBB);
  else
    PeeledBack.push_front(NewBB);
  // Walk both blocks in lockstep: PeelSingleBlockLoop clones instructions in
  // order, so the I-th non-terminator of NewBB is the clone of the I-th
  // instruction of BB. The kernel instruction is the canonical version of
  // both, and BlockMIs maps (block, canonical) to the copy in that block.
  for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
       ++I, ++NI) {
    CanonicalMIs[&*I] = &*I;
    CanonicalMIs[&*NI] = &*I;
    BlockMIs[{NewBB, &*I}] = &*NI;
    BlockMIs[{BB, &*I}] = &*I;
  }
  return NewBB;
}
1613
/// Remove from \p MB every scheduled instruction whose stage is below
/// \p MinStage, rewriting uses of its defs to the equivalent registers in
/// the using blocks. Instructions with no stage (-1) are kept.
void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
                                                       int MinStage) {
  // Iterate in reverse from the last non-terminator back to the first
  // non-PHI, advancing before any erase so the iterator stays valid.
  for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
       I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
    MachineInstr *MI = &*I++;
    int Stage = getStage(MI);
    if (Stage == -1 || Stage >= MinStage)
      continue;

    for (MachineOperand &DefMO : MI->defs()) {
      SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
      for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
        // Only PHIs can use values from this block by construction.
        // Match with the equivalent PHI in B.
        assert(UseMI.isPHI());
        Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
                                               MI->getParent());
        Subs.emplace_back(&UseMI, Reg);
      }
      // Substitute outside the use_instructions() walk to avoid invalidating
      // the use list while iterating it.
      for (auto &Sub : Subs)
        Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
                                      *MRI.getTargetRegisterInfo());
    }
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(*MI);
    MI->eraseFromParent();
  }
}
1642
/// Move all instructions of stage \p Stage from \p SourceBB into \p DestBB,
/// creating or cloning PHIs as needed so every moved use still sees the
/// correct value, and cleaning up PHIs made redundant by the move.
void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
    MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
  auto InsertPt = DestBB->getFirstNonPHI();
  // Maps a register defined in SourceBB to its replacement in DestBB.
  DenseMap<Register, Register> Remaps;
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
    if (MI.isPHI()) {
      // This is an illegal PHI. If we move any instructions using an illegal
      // PHI, we need to create a legal Phi.
      if (getStage(&MI) != Stage) {
        // The legal Phi is not necessary if the illegal phi's stage
        // is being moved.
        Register PhiR = MI.getOperand(0).getReg();
        auto RC = MRI.getRegClass(PhiR);
        Register NR = MRI.createVirtualRegister(RC);
        MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
                                   DebugLoc(), TII->get(TargetOpcode::PHI), NR)
                               .addReg(PhiR)
                               .addMBB(SourceBB);
        BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
        CanonicalMIs[NI] = CanonicalMIs[&MI];
        Remaps[PhiR] = NR;
      }
    }
    if (getStage(&MI) != Stage)
      continue;
    // Move the instruction and retarget the BlockMIs entry for its kernel
    // canonical from SourceBB to DestBB.
    MI.removeFromParent();
    DestBB->insert(InsertPt, &MI);
    auto *KernelMI = CanonicalMIs[&MI];
    BlockMIs[{DestBB, KernelMI}] = &MI;
    BlockMIs.erase({SourceBB, KernelMI});
  }
  // A PHI in DestBB whose single incoming value was defined by an
  // instruction that has just moved into DestBB is now redundant: forward
  // the incoming register and delete the PHI.
  SmallVector<MachineInstr *, 4> PhiToDelete;
  for (MachineInstr &MI : DestBB->phis()) {
    assert(MI.getNumOperands() == 3);
    MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
    // If the instruction referenced by the phi is moved inside the block
    // we don't need the phi anymore.
    if (getStage(Def) == Stage) {
      Register PhiReg = MI.getOperand(0).getReg();
      assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
                                            /*TRI=*/nullptr) != -1);
      MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
      // Restore the PHI's own def so it stays well-formed until erased.
      MI.getOperand(0).setReg(PhiReg);
      PhiToDelete.push_back(&MI);
    }
  }
  for (auto *P : PhiToDelete)
    P->eraseFromParent();
  InsertPt = DestBB->getFirstNonPHI();
  // Helper to clone Phi instructions into the destination block. We clone Phi
  // greedily to avoid combinatorial explosion of Phi instructions.
  auto clonePhi = [&](MachineInstr *Phi) {
    MachineInstr *NewMI = MF.CloneMachineInstr(Phi);
    DestBB->insert(InsertPt, NewMI);
    Register OrigR = Phi->getOperand(0).getReg();
    Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
    NewMI->getOperand(0).setReg(R);
    NewMI->getOperand(1).setReg(OrigR);
    NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
    Remaps[OrigR] = R;
    CanonicalMIs[NewMI] = CanonicalMIs[Phi];
    BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
    PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
    return R;
  };
  // Rewrite uses in the moved instructions: either apply an existing remap,
  // or clone a PHI from SourceBB when the moved code still reads one.
  for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
    for (MachineOperand &MO : I->uses()) {
      if (!MO.isReg())
        continue;
      if (Remaps.count(MO.getReg()))
        MO.setReg(Remaps[MO.getReg()]);
      else {
        // If we are using a phi from the source block we need to add a new phi
        // pointing to the old one.
        MachineInstr *Use = MRI.getUniqueVRegDef(MO.getReg());
        if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
          Register R = clonePhi(Use);
          MO.setReg(R);
        }
      }
    }
  }
}
1727
/// Starting from \p CanonicalPhi, follow the loop-carried operand of the
/// canonical PHI chain as many times as \p Phi's recorded loop iteration
/// and return the register reached. This finds the value that corresponds
/// to \p Phi's iteration in the canonical kernel.
Register
PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
                                                  MachineInstr *Phi) {
  unsigned distance = PhiNodeLoopIteration[Phi];
  MachineInstr *CanonicalUse = CanonicalPhi;
  Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
  for (unsigned I = 0; I < distance; ++I) {
    assert(CanonicalUse->isPHI());
    assert(CanonicalUse->getNumOperands() == 5);
    // Pick the operand pair whose incoming block is the PHI's own block
    // (i.e. the loop-carried value); the other pair is the init value.
    unsigned LoopRegIdx = 3, InitRegIdx = 1;
    if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
      std::swap(LoopRegIdx, InitRegIdx);
    CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
    CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
  }
  return CanonicalUseReg;
}
1745
/// Peel prolog and epilog copies of the kernel, assign each block its set of
/// live/available stages, wire up the early-exit edges from prologs to
/// epilogs, and remap all peeled instructions.
void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
  // In the kernel itself every stage is both live and available.
  BitVector LS(Schedule.getNumStages(), true);
  BitVector AS(Schedule.getNumStages(), true);
  LiveStages[BB] = LS;
  AvailableStages[BB] = AS;

  // Peel out the prologs. Prolog #I executes stages [0, I].
  LS.reset();
  for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
    LS[I] = true;
    Prologs.push_back(peelKernel(LPD_Front));
    LiveStages[Prologs.back()] = LS;
    AvailableStages[Prologs.back()] = LS;
  }

  // Create a block that will end up as the new loop exiting block (dominated by
  // all prologs and epilogs). It will only contain PHIs, in the same order as
  // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
  // that the exiting block is a (sub) clone of BB. This in turn gives us the
  // property that any value defined in BB but used outside of BB is used by a
  // PHI in the exiting block.
  MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
  EliminateDeadPhis(ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
  // Push out the epilogs, again in reverse order.
  // We can't assume anything about the minimum loop trip count at this point,
  // so emit a fairly complex epilog.

  // We first peel number of stages minus one epilogue. Then we remove dead
  // stages and reorder instructions based on their stage. If we have 3 stages
  // we generate first:
  // E0[3, 2, 1]
  // E1[3', 2']
  // E2[3'']
  // And then we move instructions based on their stages to have:
  // E0[3]
  // E1[2, 3']
  // E2[1, 2', 3'']
  // The transformation is legal because we only move instructions past
  // instructions of a previous loop iteration.
  for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
    Epilogs.push_back(peelKernel(LPD_Back));
    MachineBasicBlock *B = Epilogs.back();
    filterInstructions(B, Schedule.getNumStages() - I);
    // Keep track at which iteration each phi belongs to. We need it to know
    // what version of the variable to use during prologue/epilogue stitching.
    EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
    for (MachineInstr &Phi : B->phis())
      PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
  }
  // Redistribute stages across the epilog blocks as sketched above.
  for (size_t I = 0; I < Epilogs.size(); I++) {
    LS.reset();
    for (size_t J = I; J < Epilogs.size(); J++) {
      int Iteration = J;
      unsigned Stage = Schedule.getNumStages() - 1 + I - J;
      // Move stage one block at a time so that Phi nodes are updated correctly.
      for (size_t K = Iteration; K > I; K--)
        moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
      LS[Stage] = true;
    }
    LiveStages[Epilogs[I]] = LS;
    AvailableStages[Epilogs[I]] = AS;
  }

  // Now we've defined all the prolog and epilog blocks as a fallthrough
  // sequence, add the edges that will be followed if the loop trip count is
  // lower than the number of stages (connecting prologs directly with epilogs).
  auto PI = Prologs.begin();
  auto EI = Epilogs.begin();
  assert(Prologs.size() == Epilogs.size());
  for (; PI != Prologs.end(); ++PI, ++EI) {
    MachineBasicBlock *Pred = *(*EI)->pred_begin();
    (*PI)->addSuccessor(*EI);
    // Extend each epilog PHI with an incoming value for the new prolog edge.
    for (MachineInstr &MI : (*EI)->phis()) {
      Register Reg = MI.getOperand(1).getReg();
      MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
      if (Use && Use->getParent() == Pred) {
        MachineInstr *CanonicalUse = CanonicalMIs[Use];
        if (CanonicalUse->isPHI()) {
          // If the use comes from a phi we need to skip as many phi as the
          // distance between the epilogue and the kernel. Trace through the phi
          // chain to find the right value.
          Reg = getPhiCanonicalReg(CanonicalUse, Use);
        }
        Reg = getEquivalentRegisterIn(Reg, *PI);
      }
      MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
      MI.addOperand(MachineOperand::CreateMBB(*PI));
    }
  }

  // Create a list of all blocks in order.
  SmallVector<MachineBasicBlock *, 8> Blocks;
  llvm::copy(PeeledFront, std::back_inserter(Blocks));
  Blocks.push_back(BB);
  llvm::copy(PeeledBack, std::back_inserter(Blocks));

  // Iterate in reverse order over all instructions, remapping as we go.
  for (MachineBasicBlock *B : reverse(Blocks)) {
    for (auto I = B->instr_rbegin();
         I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
      MachineBasicBlock::reverse_instr_iterator MI = I++;
      rewriteUsesOf(&*MI);
    }
  }
  // Illegal PHIs were kept alive by rewriteUsesOf for BlockMIs lookups;
  // it is now safe to delete them.
  for (auto *MI : IllegalPhisToDelete) {
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(*MI);
    MI->eraseFromParent();
  }
  IllegalPhisToDelete.clear();

  // Now all remapping has been done, we're free to optimize the generated code.
  for (MachineBasicBlock *B : reverse(Blocks))
    EliminateDeadPhis(B, MRI, LIS);
  EliminateDeadPhis(ExitingBB, MRI, LIS);
}
1862
/// Create a new exiting block between BB and its exit that contains one PHI
/// per PHI in BB, giving the loop LCSSA-like form: every value defined in BB
/// and used outside it flows through a PHI in the new block.
MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
  MachineFunction &MF = *BB->getParent();
  // BB has exactly two successors: itself (the loop back-edge) and the exit.
  MachineBasicBlock *Exit = *BB->succ_begin();
  if (Exit == BB)
    Exit = *std::next(BB->succ_begin());

  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
  MF.insert(std::next(BB->getIterator()), NewBB);

  // Clone all phis in BB into NewBB and rewrite.
  for (MachineInstr &MI : BB->phis()) {
    auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
    // Operand 3 is the loop-carried value of the PHI; uses of it outside BB
    // must be redirected through the new block's PHI.
    Register OldR = MI.getOperand(3).getReg();
    Register R = MRI.createVirtualRegister(RC);
    SmallVector<MachineInstr *, 4> Uses;
    for (MachineInstr &Use : MRI.use_instructions(OldR))
      if (Use.getParent() != BB)
        Uses.push_back(&Use);
    for (MachineInstr *Use : Uses)
      Use->substituteRegister(OldR, R, /*SubIdx=*/0,
                              *MRI.getTargetRegisterInfo());
    MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
                           .addReg(OldR)
                           .addMBB(BB);
    BlockMIs[{NewBB, &MI}] = NI;
    CanonicalMIs[NI] = &MI;
  }
  // Splice NewBB into the CFG between BB and Exit.
  BB->replaceSuccessor(Exit, NewBB);
  Exit->replacePhiUsesWith(BB, NewBB);
  NewBB->addSuccessor(Exit);

  // Redirect BB's exit branch to NewBB and make NewBB jump to Exit.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
  (void)CanAnalyzeBr;
  assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
  TII->removeBranch(*BB);
  TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
                    Cond, DebugLoc());
  TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
  return NewBB;
}
1905
1906 Register
getEquivalentRegisterIn(Register Reg,MachineBasicBlock * BB)1907 PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1908 MachineBasicBlock *BB) {
1909 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1910 unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1911 return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
1912 }
1913
/// Rewrite or remove \p MI after peeling. Illegal PHIs are collapsed onto
/// their loop-carried (or, if unavailable, init) value; dead (non-live-stage)
/// instructions have their uses redirected and are erased.
void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
  if (MI->isPHI()) {
    // This is an illegal PHI. The loop-carried (desired) value is operand 3,
    // and it is produced by this block.
    Register PhiR = MI->getOperand(0).getReg();
    Register R = MI->getOperand(3).getReg();
    int RMIStage = getStage(MRI.getUniqueVRegDef(R));
    // Fall back to the init value when the loop-carried value's stage has
    // not been made available in this block.
    if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
      R = MI->getOperand(1).getReg();
    MRI.setRegClass(R, MRI.getRegClass(PhiR));
    MRI.replaceRegWith(PhiR, R);
    // Postpone deleting the Phi as it may be referenced by BlockMIs and used
    // later to figure out how to remap registers.
    MI->getOperand(0).setReg(PhiR);
    IllegalPhisToDelete.push_back(MI);
    return;
  }

  int Stage = getStage(MI);
  if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
      LiveStages[MI->getParent()].test(Stage))
    // Instruction is live, no rewriting to do.
    return;

  // Dead stage: forward each def to its equivalent in the using block, then
  // erase the instruction.
  for (MachineOperand &DefMO : MI->defs()) {
    SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
    for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
      // Only PHIs can use values from this block by construction.
      // Match with the equivalent PHI in B.
      assert(UseMI.isPHI());
      Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
                                             MI->getParent());
      Subs.emplace_back(&UseMI, Reg);
    }
    // Substitute after the walk so the use list is not invalidated mid-loop.
    for (auto &Sub : Subs)
      Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
                                    *MRI.getTargetRegisterInfo());
  }
  if (LIS)
    LIS->RemoveMachineInstrFromMaps(*MI);
  MI->eraseFromParent();
}
1956
/// Insert the trip-count guards on each prolog: branch to the matching epilog
/// when the remaining trip count is too small, otherwise fall through. Also
/// adjusts (or disposes of) the loop's trip count bookkeeping.
void PeelingModuloScheduleExpander::fixupBranches() {
  // Work outwards from the kernel.
  bool KernelDisposed = false;
  // TC is the trip-count threshold guarded by each prolog, largest first.
  int TC = Schedule.getNumStages() - 1;
  for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
       ++PI, ++EI, --TC) {
    MachineBasicBlock *Prolog = *PI;
    MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
    MachineBasicBlock *Epilog = *EI;
    SmallVector<MachineOperand, 4> Cond;
    TII->removeBranch(*Prolog);
    std::optional<bool> StaticallyGreater =
        LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
    if (!StaticallyGreater) {
      LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
      // Dynamically branch based on Cond.
      TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
    } else if (*StaticallyGreater == false) {
      LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
      // Prolog never falls through; branch to epilog and orphan interior
      // blocks. Leave it to unreachable-block-elim to clean up.
      Prolog->removeSuccessor(Fallthrough);
      // Drop the now-dead incoming (value, block) pair from each PHI in the
      // orphaned fallthrough block.
      for (MachineInstr &P : Fallthrough->phis()) {
        P.removeOperand(2);
        P.removeOperand(1);
      }
      TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
      KernelDisposed = true;
    } else {
      LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
      // Prolog always falls through; remove incoming values in epilog.
      Prolog->removeSuccessor(Epilog);
      for (MachineInstr &P : Epilog->phis()) {
        P.removeOperand(4);
        P.removeOperand(3);
      }
    }
  }

  if (!KernelDisposed) {
    // The kernel executes fewer iterations now that prolog/epilog copies
    // exist; shrink the trip count accordingly.
    LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
    LoopInfo->setPreheader(Prologs.back());
  } else {
    LoopInfo->disposed();
  }
}
2003
rewriteKernel()2004 void PeelingModuloScheduleExpander::rewriteKernel() {
2005 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2006 KR.rewrite();
2007 }
2008
/// Main entry point: rewrite the kernel, peel prologs/epilogs, and fix up the
/// branches between them.
void PeelingModuloScheduleExpander::expand() {
  BB = Schedule.getLoop()->getTopBlock();
  Preheader = Schedule.getLoop()->getLoopPreheader();
  LLVM_DEBUG(Schedule.dump());
  LoopInfo = TII->analyzeLoopForPipelining(BB);
  // The pipeliner should only have scheduled loops the target can analyze.
  assert(LoopInfo);

  rewriteKernel();
  peelPrologAndEpilogs();
  fixupBranches();
}
2020
/// Cross-check the peeling expansion against the classic
/// ModuloScheduleExpander: run both on the same schedule and compare the
/// resulting kernels operand-by-operand (by PHI distance). Aborts with a
/// fatal error and dumps both kernels on mismatch.
void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
  BB = Schedule.getLoop()->getTopBlock();
  Preheader = Schedule.getLoop()->getLoopPreheader();

  // Dump the schedule before we invalidate and remap all its instructions.
  // Stash it in a string so we can print it if we found an error.
  std::string ScheduleDump;
  raw_string_ostream OS(ScheduleDump);
  Schedule.print(OS);
  OS.flush();

  // First, run the normal ModuloScheduleExpander. We don't support any
  // InstrChanges.
  assert(LIS && "Requires LiveIntervals!");
  ModuloScheduleExpander MSE(MF, Schedule, *LIS,
                             ModuloScheduleExpander::InstrChangesTy());
  MSE.expand();
  MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
  if (!ExpandedKernel) {
    // The expander optimized away the kernel. We can't do any useful checking.
    MSE.cleanup();
    return;
  }
  // Before running the KernelRewriter, re-add BB into the CFG.
  Preheader->addSuccessor(BB);

  // Now run the new expansion algorithm.
  KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
  KR.rewrite();
  peelPrologAndEpilogs();

  // Collect all illegal phis that the new algorithm created. We'll give these
  // to KernelOperandInfo.
  SmallPtrSet<MachineInstr *, 4> IllegalPhis;
  for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
    if (NI->isPHI())
      IllegalPhis.insert(&*NI);
  }

  // Co-iterate across both kernels. We expect them to be identical apart from
  // phis and full COPYs (we look through both).
  SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
  auto OI = ExpandedKernel->begin();
  auto NI = BB->begin();
  for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
    while (OI->isPHI() || OI->isFullCopy())
      ++OI;
    while (NI->isPHI() || NI->isFullCopy())
      ++NI;
    assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
    // Analyze every operand separately.
    for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
         OOpI != OI->operands_end(); ++OOpI, ++NOpI)
      KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
                        KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
  }

  // Report every operand pair whose PHI distance disagrees.
  bool Failed = false;
  for (auto &OldAndNew : KOIs) {
    if (OldAndNew.first == OldAndNew.second)
      continue;
    Failed = true;
    errs() << "Modulo kernel validation error: [\n";
    errs() << " [golden] ";
    OldAndNew.first.print(errs());
    errs() << "          ";
    OldAndNew.second.print(errs());
    errs() << "]\n";
  }

  if (Failed) {
    errs() << "Golden reference kernel:\n";
    ExpandedKernel->print(errs());
    errs() << "New kernel:\n";
    BB->print(errs());
    errs() << ScheduleDump;
    report_fatal_error(
        "Modulo kernel validation (-pipeliner-experimental-cg) failed");
  }

  // Cleanup by removing BB from the CFG again as the original
  // ModuloScheduleExpander intended.
  Preheader->removeSuccessor(BB);
  MSE.cleanup();
}
2106
cloneInstr(MachineInstr * OldMI)2107 MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2108 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2109
2110 // TODO: Offset information needs to be corrected.
2111 NewMI->dropMemRefs(MF);
2112
2113 return NewMI;
2114 }
2115
2116 /// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2117 /// If it is already dedicated exit, return it. Otherwise, insert a new
2118 /// block between them and return the new block.
/// Create a dedicated exit for Loop. Exit is the original exit for Loop.
/// If it is already dedicated exit, return it. Otherwise, insert a new
/// block between them and return the new block.
static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
                                              MachineBasicBlock *Exit) {
  // Already dedicated: Loop is Exit's only predecessor.
  if (Exit->pred_size() == 1)
    return Exit;

  MachineFunction *MF = Loop->getParent();
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();

  MachineBasicBlock *NewExit =
      MF->CreateMachineBasicBlock(Loop->getBasicBlock());
  MF->insert(Loop->getIterator(), NewExit);

  // Retarget Loop's exit branch at the new block; one branch target must be
  // the loop itself (self-loop), the other is the exit being replaced.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  TII->analyzeBranch(*Loop, TBB, FBB, Cond);
  if (TBB == Loop)
    FBB = NewExit;
  else if (FBB == Loop)
    TBB = NewExit;
  else
    llvm_unreachable("unexpected loop structure");
  TII->removeBranch(*Loop);
  TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
  Loop->replaceSuccessor(Exit, NewExit);
  TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
  NewExit->addSuccessor(Exit);

  // Incoming PHI edges in Exit that came from Loop now come from NewExit.
  Exit->replacePhiUsesWith(Loop, NewExit);

  return NewExit;
}
2150
2151 /// Insert branch code into the end of MBB. It branches to GreaterThan if the
2152 /// remaining trip count for instructions in LastStage0Insts is greater than
2153 /// RequiredTC, and to Otherwise otherwise.
/// Insert branch code into the end of MBB. It branches to GreaterThan if the
/// remaining trip count for instructions in LastStage0Insts is greater than
/// RequiredTC, and to Otherwise otherwise.
void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
                                                 int RequiredTC,
                                                 InstrMapTy &LastStage0Insts,
                                                 MachineBasicBlock &GreaterThan,
                                                 MachineBasicBlock &Otherwise) {
  SmallVector<MachineOperand, 4> Cond;
  LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond,
                                                      LastStage0Insts);

  if (SwapBranchTargetsMVE) {
    // Set SwapBranchTargetsMVE to true if a target prefers to replace TBB and
    // FBB for optimal performance.
    if (TII->reverseBranchCondition(Cond))
      llvm_unreachable("can not reverse branch condition");
    TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc());
  } else {
    TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc());
  }
}
2173
2174 /// Generate a pipelined loop that is unrolled by using MVE algorithm and any
2175 /// other necessary blocks. The control flow is modified to execute the
2176 /// pipelined loop if the trip count satisfies the condition, otherwise the
2177 /// original loop. The original loop is also used to execute the remainder
2178 /// iterations which occur due to unrolling.
/// Generate a pipelined loop that is unrolled by using MVE algorithm and any
/// other necessary blocks. The control flow is modified to execute the
/// pipelined loop if the trip count satisfies the condition, otherwise the
/// original loop. The original loop is also used to execute the remainder
/// iterations which occur due to unrolling.
void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
  // The control flow for pipelining with MVE:
  //
  // OrigPreheader:
  //   // The block that is originally the loop preheader
  //   goto Check
  //
  // Check:
  //   // Check whether the trip count satisfies the requirements to pipeline.
  //   if (LoopCounter > NumStages + NumUnroll - 2)
  //     // The minimum number of iterations to pipeline =
  //     //   iterations executed in prolog/epilog (NumStages-1) +
  //     //   iterations executed in one kernel run (NumUnroll)
  //     goto Prolog
  //   // fallback to the original loop
  //   goto NewPreheader
  //
  // Prolog:
  //   // All prolog stages. There are no direct branches to the epilogue.
  //   goto NewKernel
  //
  // NewKernel:
  //   // NumUnroll copies of the kernel
  //   if (LoopCounter > MVE-1)
  //     goto NewKernel
  //   goto Epilog
  //
  // Epilog:
  //   // All epilog stages.
  //   if (LoopCounter > 0)
  //     // The remainder is executed in the original loop
  //     goto NewPreheader
  //   goto NewExit
  //
  // NewPreheader:
  //   // Newly created preheader for the original loop.
  //   // The initial values of the phis in the loop are merged from two paths.
  //   NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
  //   goto OrigKernel
  //
  // OrigKernel:
  //   // The original loop block.
  //   if (LoopCounter != 0)
  //     goto OrigKernel
  //   goto NewExit
  //
  // NewExit:
  //   // Newly created dedicated exit for the original loop.
  //   // Merge values which are referenced after the loop
  //   Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
  //   goto OrigExit
  //
  // OrigExit:
  //   // The block that is originally the loop exit.
  //   // If it is already a dedicated exit, NewExit is not created.

  // An example of where each stage is executed:
  // Assume #Stages 3, #MVE 4, #Iterations 12
  // Iter   0 1 2 3 4 5 6 7 8 9 10-11
  // -------------------------------------------------
  // Stage  0                          Prolog#0
  // Stage  1 0                        Prolog#1
  // Stage  2 1 0                      Kernel Unroll#0 Iter#0
  // Stage    2 1 0                    Kernel Unroll#1 Iter#0
  // Stage      2 1 0                  Kernel Unroll#2 Iter#0
  // Stage        2 1 0                Kernel Unroll#3 Iter#0
  // Stage          2 1 0              Kernel Unroll#0 Iter#1
  // Stage            2 1 0            Kernel Unroll#1 Iter#1
  // Stage              2 1 0          Kernel Unroll#2 Iter#1
  // Stage                2 1 0        Kernel Unroll#3 Iter#1
  // Stage                  2 1        Epilog#0
  // Stage                    2        Epilog#1
  // Stage                      0-2    OrigKernel

  LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
  assert(LoopInfo && "Must be able to analyze loop!");

  calcNumUnroll();

  // Create the new blocks of the diagram above and insert them before the
  // original kernel so the layout falls through in order.
  Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());

  MF.insert(OrigKernel->getIterator(), Check);
  MF.insert(OrigKernel->getIterator(), Prolog);
  MF.insert(OrigKernel->getIterator(), NewKernel);
  MF.insert(OrigKernel->getIterator(), Epilog);
  MF.insert(OrigKernel->getIterator(), NewPreheader);

  NewExit = createDedicatedExit(OrigKernel, OrigExit);

  // Wire up the CFG edges shown in the diagram above.
  NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
  TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());

  OrigPreheader->addSuccessor(Check);
  TII->removeBranch(*OrigPreheader);
  TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());

  Check->addSuccessor(Prolog);
  Check->addSuccessor(NewPreheader);

  Prolog->addSuccessor(NewKernel);

  NewKernel->addSuccessor(NewKernel);
  NewKernel->addSuccessor(Epilog);

  Epilog->addSuccessor(NewPreheader);
  Epilog->addSuccessor(NewExit);

  // Guard the pipelined version behind the minimum trip count check.
  InstrMapTy LastStage0Insts;
  insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2,
                   LastStage0Insts, *Prolog, *NewPreheader);

  // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
  // register#
  SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
  generateProlog(PrologVRMap);
  generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
  generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
}
2301
2302 /// Replace MI's use operands according to the maps.
/// Replace MI's use operands according to the maps.
void ModuloScheduleExpanderMVE::updateInstrUse(
    MachineInstr *MI, int StageNum, int PhaseNum,
    SmallVectorImpl<ValueMapTy> &CurVRMap,
    SmallVectorImpl<ValueMapTy> *PrevVRMap) {
  // If MI is in the prolog/kernel/epilog block, CurVRMap is
  // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
  // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
  // Refer to the appropriate map according to the stage difference between
  // MI and the definition of an operand.

  for (MachineOperand &UseMO : MI->uses()) {
    if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
      continue;
    int DiffStage = 0;
    Register OrigReg = UseMO.getReg();
    MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
    // Values defined outside the original kernel need no remapping.
    if (!DefInst || DefInst->getParent() != OrigKernel)
      continue;
    unsigned InitReg = 0;
    unsigned DefReg = OrigReg;
    if (DefInst->isPHI()) {
      // A PHI use means the value comes from one iteration earlier.
      ++DiffStage;
      unsigned LoopReg;
      getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
      // LoopReg is guaranteed to be defined within the loop by canApply()
      DefReg = LoopReg;
      DefInst = MRI.getVRegDef(LoopReg);
    }
    unsigned DefStageNum = Schedule.getStage(DefInst);
    DiffStage += StageNum - DefStageNum;
    Register NewReg;
    if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
      // NewReg is defined in a previous phase of the same block
      NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
    else if (!PrevVRMap)
      // Since this is the first iteration, refer the initial register of the
      // loop
      NewReg = InitReg;
    else
      // Cases where DiffStage is larger than PhaseNum.
      // If MI is in the kernel block, the value is defined by the previous
      // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
      // value is defined in the kernel block and KernelVRMap is referenced.
      NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];

    // Keep the register class compatible with the original operand; if the
    // classes cannot be constrained, insert a COPY to split them.
    const TargetRegisterClass *NRC =
        MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
    if (NRC)
      UseMO.setReg(NewReg);
    else {
      Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
      BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
              SplitReg)
          .addReg(NewReg);
      UseMO.setReg(SplitReg);
    }
  }
}
2361
2362 /// Return a phi if Reg is referenced by the phi.
2363 /// canApply() guarantees that at most only one such phi exists.
getLoopPhiUser(Register Reg,MachineBasicBlock * Loop)2364 static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2365 for (MachineInstr &Phi : Loop->phis()) {
2366 unsigned InitVal, LoopVal;
2367 getPhiRegs(Phi, Loop, InitVal, LoopVal);
2368 if (LoopVal == Reg)
2369 return Φ
2370 }
2371 return nullptr;
2372 }
2373
/// Generate phis for registers defined by OrigMI.
///
/// For each register defined by \p OrigMI in the unrolled kernel copy
/// \p UnrollNum, a phi is inserted at the top of the new kernel that merges
/// the value produced by the previous kernel iteration with either the
/// corresponding prolog copy (PrologVRMap) or the loop's initial value.
/// The phi result is recorded in PhiVRMap[UnrollNum] so that later use
/// rewriting can pick it up. Returns early when no phi is needed for this
/// (stage, unroll) combination — see the stage diagrams below.
void ModuloScheduleExpanderMVE::generatePhi(
    MachineInstr *OrigMI, int UnrollNum,
    SmallVectorImpl<ValueMapTy> &PrologVRMap,
    SmallVectorImpl<ValueMapTy> &KernelVRMap,
    SmallVectorImpl<ValueMapTy> &PhiVRMap) {
  int StageNum = Schedule.getStage(OrigMI);
  bool UsePrologReg;
  if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
    // A prolog copy of this stage exists; merge with that register.
    UsePrologReg = true;
  else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
    // No prolog copy for this stage; merge with the loop's initial value.
    UsePrologReg = false;
  else
    // No phi is required for this (stage, unroll) combination.
    return;

  // Examples that show which stages are merged by phi.
  // Meaning of the symbol following the stage number:
  //   a/b: Stages with the same letter are merged (UsePrologReg == true)
  //   +: Merged with the initial value (UsePrologReg == false)
  //   *: No phis required
  //
  // #Stages 3, #MVE 4
  // Iter   0 1 2 3 4 5 6 7 8
  // -----------------------------------------
  // Stage  0a                 Prolog#0
  // Stage  1a 0b              Prolog#1
  // Stage  2* 1* 0*           Kernel Unroll#0
  // Stage     2* 1* 0+        Kernel Unroll#1
  // Stage        2* 1+ 0a     Kernel Unroll#2
  // Stage           2+ 1a 0b  Kernel Unroll#3
  //
  // #Stages 3, #MVE 2
  // Iter   0 1 2 3 4 5 6 7 8
  // -----------------------------------------
  // Stage  0a                 Prolog#0
  // Stage  1a 0b              Prolog#1
  // Stage  2* 1+ 0a           Kernel Unroll#0
  // Stage     2+ 1a 0b        Kernel Unroll#1
  //
  // #Stages 3, #MVE 1
  // Iter   0 1 2 3 4 5 6 7 8
  // -----------------------------------------
  // Stage  0*                 Prolog#0
  // Stage  1a 0b              Prolog#1
  // Stage  2+ 1a 0b           Kernel Unroll#0

  for (MachineOperand &DefMO : OrigMI->defs()) {
    if (!DefMO.isReg() || DefMO.isDead())
      continue;
    Register OrigReg = DefMO.getReg();
    // Skip defs that were not renamed for this unroll copy.
    auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
    if (NewReg == KernelVRMap[UnrollNum].end())
      continue;
    Register CorrespondReg;
    if (UsePrologReg) {
      int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
      CorrespondReg = PrologVRMap[PrologNum][OrigReg];
    } else {
      // The initial value comes from the phi in the original kernel that
      // consumes OrigReg on its backedge; if no such phi exists, the value
      // is not live across iterations and no merge phi is needed.
      MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
      if (!Phi)
        continue;
      CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
    }

    assert(CorrespondReg.isValid());
    Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
    // Incoming: NewKernel (backedge value) and Prolog (entry value).
    BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
            TII->get(TargetOpcode::PHI), PhiReg)
        .addReg(NewReg->second)
        .addMBB(NewKernel)
        .addReg(CorrespondReg)
        .addMBB(Prolog);
    PhiVRMap[UnrollNum][OrigReg] = PhiReg;
  }
}
2449
replacePhiSrc(MachineInstr & Phi,Register OrigReg,Register NewReg,MachineBasicBlock * NewMBB)2450 static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2451 MachineBasicBlock *NewMBB) {
2452 for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2453 if (Phi.getOperand(Idx).getReg() == OrigReg) {
2454 Phi.getOperand(Idx).setReg(NewReg);
2455 Phi.getOperand(Idx + 1).setMBB(NewMBB);
2456 return;
2457 }
2458 }
2459 }
2460
2461 /// Generate phis that merge values from multiple routes
mergeRegUsesAfterPipeline(Register OrigReg,Register NewReg)2462 void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2463 Register NewReg) {
2464 SmallVector<MachineOperand *> UsesAfterLoop;
2465 SmallVector<MachineInstr *> LoopPhis;
2466 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
2467 E = MRI.use_end();
2468 I != E; ++I) {
2469 MachineOperand &O = *I;
2470 if (O.getParent()->getParent() != OrigKernel &&
2471 O.getParent()->getParent() != Prolog &&
2472 O.getParent()->getParent() != NewKernel &&
2473 O.getParent()->getParent() != Epilog)
2474 UsesAfterLoop.push_back(&O);
2475 if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2476 LoopPhis.push_back(O.getParent());
2477 }
2478
2479 // Merge the route that only execute the pipelined loop (when there are no
2480 // remaining iterations) with the route that execute the original loop.
2481 if (!UsesAfterLoop.empty()) {
2482 Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2483 BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
2484 TII->get(TargetOpcode::PHI), PhiReg)
2485 .addReg(OrigReg)
2486 .addMBB(OrigKernel)
2487 .addReg(NewReg)
2488 .addMBB(Epilog);
2489
2490 for (MachineOperand *MO : UsesAfterLoop)
2491 MO->setReg(PhiReg);
2492
2493 if (!LIS.hasInterval(PhiReg))
2494 LIS.createEmptyInterval(PhiReg);
2495 }
2496
2497 // Merge routes from the pipelined loop and the bypassed route before the
2498 // original loop
2499 if (!LoopPhis.empty()) {
2500 for (MachineInstr *Phi : LoopPhis) {
2501 unsigned InitReg, LoopReg;
2502 getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
2503 Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg));
2504 BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(),
2505 TII->get(TargetOpcode::PHI), NewInit)
2506 .addReg(InitReg)
2507 .addMBB(Check)
2508 .addReg(NewReg)
2509 .addMBB(Epilog);
2510 replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader);
2511 }
2512 }
2513 }
2514
generateProlog(SmallVectorImpl<ValueMapTy> & PrologVRMap)2515 void ModuloScheduleExpanderMVE::generateProlog(
2516 SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2517 PrologVRMap.clear();
2518 PrologVRMap.resize(Schedule.getNumStages() - 1);
2519 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2520 for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2521 ++PrologNum) {
2522 for (MachineInstr *MI : Schedule.getInstructions()) {
2523 if (MI->isPHI())
2524 continue;
2525 int StageNum = Schedule.getStage(MI);
2526 if (StageNum > PrologNum)
2527 continue;
2528 MachineInstr *NewMI = cloneInstr(MI);
2529 updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
2530 NewMIMap[NewMI] = {PrologNum, StageNum};
2531 Prolog->push_back(NewMI);
2532 }
2533 }
2534
2535 for (auto I : NewMIMap) {
2536 MachineInstr *MI = I.first;
2537 int PrologNum = I.second.first;
2538 int StageNum = I.second.second;
2539 updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
2540 }
2541
2542 LLVM_DEBUG({
2543 dbgs() << "prolog:\n";
2544 Prolog->dump();
2545 });
2546 }
2547
generateKernel(SmallVectorImpl<ValueMapTy> & PrologVRMap,SmallVectorImpl<ValueMapTy> & KernelVRMap,InstrMapTy & LastStage0Insts)2548 void ModuloScheduleExpanderMVE::generateKernel(
2549 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2550 SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2551 KernelVRMap.clear();
2552 KernelVRMap.resize(NumUnroll);
2553 SmallVector<ValueMapTy> PhiVRMap;
2554 PhiVRMap.resize(NumUnroll);
2555 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2556 DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0;
2557 for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2558 for (MachineInstr *MI : Schedule.getInstructions()) {
2559 if (MI->isPHI())
2560 continue;
2561 int StageNum = Schedule.getStage(MI);
2562 MachineInstr *NewMI = cloneInstr(MI);
2563 if (UnrollNum == NumUnroll - 1)
2564 LastStage0Insts[MI] = NewMI;
2565 updateInstrDef(NewMI, KernelVRMap[UnrollNum],
2566 (UnrollNum == NumUnroll - 1 && StageNum == 0));
2567 generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2568 NewMIMap[NewMI] = {UnrollNum, StageNum};
2569 NewKernel->push_back(NewMI);
2570 }
2571 }
2572
2573 for (auto I : NewMIMap) {
2574 MachineInstr *MI = I.first;
2575 int UnrollNum = I.second.first;
2576 int StageNum = I.second.second;
2577 updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
2578 }
2579
2580 // If remaining trip count is greater than NumUnroll-1, loop continues
2581 insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel,
2582 *Epilog);
2583
2584 LLVM_DEBUG({
2585 dbgs() << "kernel:\n";
2586 NewKernel->dump();
2587 });
2588 }
2589
generateEpilog(SmallVectorImpl<ValueMapTy> & KernelVRMap,SmallVectorImpl<ValueMapTy> & EpilogVRMap,InstrMapTy & LastStage0Insts)2590 void ModuloScheduleExpanderMVE::generateEpilog(
2591 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2592 SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2593 EpilogVRMap.clear();
2594 EpilogVRMap.resize(Schedule.getNumStages() - 1);
2595 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2596 for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2597 ++EpilogNum) {
2598 for (MachineInstr *MI : Schedule.getInstructions()) {
2599 if (MI->isPHI())
2600 continue;
2601 int StageNum = Schedule.getStage(MI);
2602 if (StageNum <= EpilogNum)
2603 continue;
2604 MachineInstr *NewMI = cloneInstr(MI);
2605 updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum);
2606 NewMIMap[NewMI] = {EpilogNum, StageNum};
2607 Epilog->push_back(NewMI);
2608 }
2609 }
2610
2611 for (auto I : NewMIMap) {
2612 MachineInstr *MI = I.first;
2613 int EpilogNum = I.second.first;
2614 int StageNum = I.second.second;
2615 updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
2616 }
2617
2618 // If there are remaining iterations, they are executed in the original loop.
2619 // Instructions related to loop control, such as loop counter comparison,
2620 // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
2621 // in stage 0. Thus, the map is for the last one in the kernel.
2622 insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit);
2623
2624 LLVM_DEBUG({
2625 dbgs() << "epilog:\n";
2626 Epilog->dump();
2627 });
2628 }
2629
2630 /// Calculate the number of unroll required and set it to NumUnroll
calcNumUnroll()2631 void ModuloScheduleExpanderMVE::calcNumUnroll() {
2632 DenseMap<MachineInstr *, unsigned> Inst2Idx;
2633 NumUnroll = 1;
2634 for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2635 Inst2Idx[Schedule.getInstructions()[I]] = I;
2636
2637 for (MachineInstr *MI : Schedule.getInstructions()) {
2638 if (MI->isPHI())
2639 continue;
2640 int StageNum = Schedule.getStage(MI);
2641 for (const MachineOperand &MO : MI->uses()) {
2642 if (!MO.isReg() || !MO.getReg().isVirtual())
2643 continue;
2644 MachineInstr *DefMI = MRI.getVRegDef(MO.getReg());
2645 if (DefMI->getParent() != OrigKernel)
2646 continue;
2647
2648 int NumUnrollLocal = 1;
2649 if (DefMI->isPHI()) {
2650 ++NumUnrollLocal;
2651 // canApply() guarantees that DefMI is not phi and is an instruction in
2652 // the loop
2653 DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
2654 }
2655 NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
2656 if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2657 --NumUnrollLocal;
2658 NumUnroll = std::max(NumUnroll, NumUnrollLocal);
2659 }
2660 }
2661 LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2662 }
2663
2664 /// Create new virtual registers for definitions of NewMI and update NewMI.
2665 /// If the definitions are referenced after the pipelined loop, phis are
2666 /// created to merge with other routes.
updateInstrDef(MachineInstr * NewMI,ValueMapTy & VRMap,bool LastDef)2667 void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2668 ValueMapTy &VRMap,
2669 bool LastDef) {
2670 for (MachineOperand &MO : NewMI->operands()) {
2671 if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.isDef())
2672 continue;
2673 Register Reg = MO.getReg();
2674 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2675 Register NewReg = MRI.createVirtualRegister(RC);
2676 MO.setReg(NewReg);
2677 VRMap[Reg] = NewReg;
2678 if (LastDef)
2679 mergeRegUsesAfterPipeline(Reg, NewReg);
2680 }
2681 }
2682
expand()2683 void ModuloScheduleExpanderMVE::expand() {
2684 OrigKernel = Schedule.getLoop()->getTopBlock();
2685 OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2686 OrigExit = Schedule.getLoop()->getExitBlock();
2687
2688 LLVM_DEBUG(Schedule.dump());
2689
2690 generatePipelinedLoop();
2691 }
2692
2693 /// Check if ModuloScheduleExpanderMVE can be applied to L
canApply(MachineLoop & L)2694 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2695 if (!L.getExitBlock()) {
2696 LLVM_DEBUG(
2697 dbgs() << "Can not apply MVE expander: No single exit block.\n";);
2698 return false;
2699 }
2700
2701 MachineBasicBlock *BB = L.getTopBlock();
2702 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2703
2704 // Put some constraints on the operands of the phis to simplify the
2705 // transformation
2706 DenseSet<unsigned> UsedByPhi;
2707 for (MachineInstr &MI : BB->phis()) {
2708 // Registers defined by phis must be used only inside the loop and be never
2709 // used by phis.
2710 for (MachineOperand &MO : MI.defs())
2711 if (MO.isReg())
2712 for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
2713 if (Ref.getParent() != BB || Ref.isPHI()) {
2714 LLVM_DEBUG(dbgs()
2715 << "Can not apply MVE expander: A phi result is "
2716 "referenced outside of the loop or by phi.\n";);
2717 return false;
2718 }
2719
2720 // A source register from the loop block must be defined inside the loop.
2721 // A register defined inside the loop must be referenced by only one phi at
2722 // most.
2723 unsigned InitVal, LoopVal;
2724 getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
2725 if (!Register(LoopVal).isVirtual() ||
2726 MRI.getVRegDef(LoopVal)->getParent() != BB) {
2727 LLVM_DEBUG(
2728 dbgs() << "Can not apply MVE expander: A phi source value coming "
2729 "from the loop is not defined in the loop.\n";);
2730 return false;
2731 }
2732 if (UsedByPhi.count(LoopVal)) {
2733 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2734 "loop is referenced by two or more phis.\n";);
2735 return false;
2736 }
2737 UsedByPhi.insert(LoopVal);
2738 }
2739
2740 return true;
2741 }
2742
2743 //===----------------------------------------------------------------------===//
2744 // ModuloScheduleTestPass implementation
2745 //===----------------------------------------------------------------------===//
2746 // This pass constructs a ModuloSchedule from its module and runs
2747 // ModuloScheduleExpander.
2748 //
2749 // The module is expected to contain a single-block analyzable loop.
2750 // The total order of instructions is taken from the loop as-is.
2751 // Instructions are expected to be annotated with a PostInstrSymbol.
2752 // This PostInstrSymbol must have the following format:
// "Stage-%d_Cycle-%d".
2754 //===----------------------------------------------------------------------===//
2755
namespace {
/// Test pass that reconstructs a ModuloSchedule from PostInstrSymbol
/// annotations (written by ModuloScheduleTestAnnotater) and runs the
/// ModuloScheduleExpander on the first single-block loop it finds.
class ModuloScheduleTest : public MachineFunctionPass {
public:
  static char ID;

  ModuloScheduleTest() : MachineFunctionPass(ID) {
    initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  // Parse the schedule annotations in loop L and expand the modulo schedule.
  void runOnLoop(MachineFunction &MF, MachineLoop &L);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<LiveIntervalsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // namespace
2775
char ModuloScheduleTest::ID = 0;

// Register the test pass and its analysis dependencies with the pass
// registry.
INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
                      "Modulo Schedule test pass", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
                    "Modulo Schedule test pass", false, false)
2784
2785 bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2786 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2787 for (auto *L : MLI) {
2788 if (L->getTopBlock() != L->getBottomBlock())
2789 continue;
2790 runOnLoop(MF, *L);
2791 return false;
2792 }
2793 return false;
2794 }
2795
parseSymbolString(StringRef S,int & Cycle,int & Stage)2796 static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
2797 std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
2798 std::pair<StringRef, StringRef> StageTokenAndValue =
2799 getToken(StageAndCycle.first, "-");
2800 std::pair<StringRef, StringRef> CycleTokenAndValue =
2801 getToken(StageAndCycle.second, "-");
2802 if (StageTokenAndValue.first != "Stage" ||
2803 CycleTokenAndValue.first != "_Cycle") {
2804 llvm_unreachable(
2805 "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2806 return;
2807 }
2808
2809 StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
2810 CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
2811
2812 dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2813 }
2814
runOnLoop(MachineFunction & MF,MachineLoop & L)2815 void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2816 LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2817 MachineBasicBlock *BB = L.getTopBlock();
2818 dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2819
2820 DenseMap<MachineInstr *, int> Cycle, Stage;
2821 std::vector<MachineInstr *> Instrs;
2822 for (MachineInstr &MI : *BB) {
2823 if (MI.isTerminator())
2824 continue;
2825 Instrs.push_back(&MI);
2826 if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2827 dbgs() << "Parsing post-instr symbol for " << MI;
2828 parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
2829 }
2830 }
2831
2832 ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2833 std::move(Stage));
2834 ModuloScheduleExpander MSE(
2835 MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2836 MSE.expand();
2837 MSE.cleanup();
2838 }
2839
2840 //===----------------------------------------------------------------------===//
2841 // ModuloScheduleTestAnnotater implementation
2842 //===----------------------------------------------------------------------===//
2843
annotate()2844 void ModuloScheduleTestAnnotater::annotate() {
2845 for (MachineInstr *MI : S.getInstructions()) {
2846 SmallVector<char, 16> SV;
2847 raw_svector_ostream OS(SV);
2848 OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2849 MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
2850 MI->setPostInstrSymbol(MF, Sym);
2851 }
2852 }
2853