1 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Software pipelining (SWP) is an instruction scheduling technique for loops 10 // that overlaps loop iterations and exploits ILP via compiler transformations. 11 // 12 // There are multiple methods for analyzing a loop and creating a schedule. 13 // An example algorithm is Swing Modulo Scheduling (implemented by the 14 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant 15 // for the task of actually rewriting a loop to adhere to the schedule, which 16 // is what this file does. 17 // 18 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note 19 // that we only support single-block loops, so "block" and "loop" can be used 20 // interchangably. 21 // 22 // The Cycle of an instruction defines a partial order of the instructions in 23 // the remapped loop. Instructions within a cycle must not consume the output 24 // of any instruction in the same cycle. Cycle information is assumed to have 25 // been calculated such that the processor will execute instructions in 26 // lock-step (for example in a VLIW ISA). 27 // 28 // The Stage of an instruction defines the mapping between logical loop 29 // iterations and pipelined loop iterations. An example (unrolled) pipeline 30 // may look something like: 31 // 32 // I0[0] Execute instruction I0 of iteration 0 33 // I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1 34 // I1[1], I0[2] 35 // I1[2], I0[3] 36 // 37 // In the schedule for this unrolled sequence we would say that I0 was scheduled 38 // in stage 0 and I1 in stage 1: 39 // 40 // loop: 41 // [stage 0] x = I0 42 // [stage 1] I1 x (from stage 0) 43 // 44 // And to actually generate valid code we must insert a phi: 45 // 46 // loop: 47 // x' = phi(x) 48 // x = I0 49 // I1 x' 50 // 51 // This is a simple example; the rules for how to generate correct code given 52 // an arbitrary schedule containing loop-carried values are complex. 53 // 54 // Note that these examples only mention the steady-state kernel of the 55 // generated loop; prologs and epilogs must be generated also that prime and 56 // flush the pipeline. Doing so is nontrivial. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H 61 #define LLVM_CODEGEN_MODULOSCHEDULE_H 62 63 #include "llvm/CodeGen/MachineFunction.h" 64 #include "llvm/CodeGen/MachineLoopUtils.h" 65 #include "llvm/CodeGen/TargetInstrInfo.h" 66 #include "llvm/CodeGen/TargetSubtargetInfo.h" 67 #include <deque> 68 #include <map> 69 #include <vector> 70 71 namespace llvm { 72 class MachineBasicBlock; 73 class MachineLoop; 74 class MachineRegisterInfo; 75 class MachineInstr; 76 class LiveIntervals; 77 78 /// Represents a schedule for a single-block loop. For every instruction we 79 /// maintain a Cycle and Stage. 80 class ModuloSchedule { 81 private: 82 /// The block containing the loop instructions. 83 MachineLoop *Loop; 84 85 /// The instructions to be generated, in total order. Cycle provides a partial 86 /// order; the total order within cycles has been decided by the schedule 87 /// producer. 88 std::vector<MachineInstr *> ScheduledInstrs; 89 90 /// The cycle for each instruction. 91 DenseMap<MachineInstr *, int> Cycle; 92 93 /// The stage for each instruction. 94 DenseMap<MachineInstr *, int> Stage; 95 96 /// The number of stages in this schedule (Max(Stage) + 1). 97 int NumStages; 98 99 public: 100 /// Create a new ModuloSchedule. 101 /// \arg ScheduledInstrs The new loop instructions, in total resequenced 102 /// order. 103 /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does 104 /// not need to start at zero. ScheduledInstrs must be partially ordered by 105 /// Cycle. 106 /// \arg Stage Stage index for all instructions in ScheduleInstrs. ModuloSchedule(MachineFunction & MF,MachineLoop * Loop,std::vector<MachineInstr * > ScheduledInstrs,DenseMap<MachineInstr *,int> Cycle,DenseMap<MachineInstr *,int> Stage)107 ModuloSchedule(MachineFunction &MF, MachineLoop *Loop, 108 std::vector<MachineInstr *> ScheduledInstrs, 109 DenseMap<MachineInstr *, int> Cycle, 110 DenseMap<MachineInstr *, int> Stage) 111 : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)), 112 Stage(std::move(Stage)) { 113 NumStages = 0; 114 for (auto &KV : this->Stage) 115 NumStages = std::max(NumStages, KV.second); 116 ++NumStages; 117 } 118 119 /// Return the single-block loop being scheduled. getLoop()120 MachineLoop *getLoop() const { return Loop; } 121 122 /// Return the number of stages contained in this schedule, which is the 123 /// largest stage index + 1. getNumStages()124 int getNumStages() const { return NumStages; } 125 126 /// Return the first cycle in the schedule, which is the cycle index of the 127 /// first instruction. getFirstCycle()128 int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; } 129 130 /// Return the final cycle in the schedule, which is the cycle index of the 131 /// last instruction. getFinalCycle()132 int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; } 133 134 /// Return the stage that MI is scheduled in, or -1. getStage(MachineInstr * MI)135 int getStage(MachineInstr *MI) { 136 auto I = Stage.find(MI); 137 return I == Stage.end() ? -1 : I->second; 138 } 139 140 /// Return the cycle that MI is scheduled at, or -1. getCycle(MachineInstr * MI)141 int getCycle(MachineInstr *MI) { 142 auto I = Cycle.find(MI); 143 return I == Cycle.end() ? -1 : I->second; 144 } 145 146 /// Set the stage of a newly created instruction. setStage(MachineInstr * MI,int MIStage)147 void setStage(MachineInstr *MI, int MIStage) { 148 assert(Stage.count(MI) == 0); 149 Stage[MI] = MIStage; 150 } 151 152 /// Return the rescheduled instructions in order. getInstructions()153 ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; } 154 dump()155 void dump() { print(dbgs()); } 156 void print(raw_ostream &OS); 157 }; 158 159 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, 160 /// rewriting the old loop and inserting prologs and epilogs as required. 161 class ModuloScheduleExpander { 162 public: 163 using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>; 164 165 private: 166 using ValueMapTy = DenseMap<unsigned, unsigned>; 167 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; 168 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; 169 170 ModuloSchedule &Schedule; 171 MachineFunction &MF; 172 const TargetSubtargetInfo &ST; 173 MachineRegisterInfo &MRI; 174 const TargetInstrInfo *TII = nullptr; 175 LiveIntervals &LIS; 176 177 MachineBasicBlock *BB = nullptr; 178 MachineBasicBlock *Preheader = nullptr; 179 MachineBasicBlock *NewKernel = nullptr; 180 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; 181 182 /// Map for each register and the max difference between its uses and def. 183 /// The first element in the pair is the max difference in stages. The 184 /// second is true if the register defines a Phi value and loop value is 185 /// scheduled before the Phi. 186 std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff; 187 188 /// Instructions to change when emitting the final schedule. 189 InstrChangesTy InstrChanges; 190 191 void generatePipelinedLoop(); 192 void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB, 193 ValueMapTy *VRMap, MBBVectorTy &PrologBBs); 194 void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB, 195 MachineBasicBlock *OrigBB, ValueMapTy *VRMap, 196 ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs, 197 MBBVectorTy &PrologBBs); 198 void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, 199 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, 200 ValueMapTy *VRMap, InstrMapTy &InstrMap, 201 unsigned LastStageNum, unsigned CurStageNum, 202 bool IsLast); 203 void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, 204 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, 205 ValueMapTy *VRMap, ValueMapTy *VRMapPhi, 206 InstrMapTy &InstrMap, unsigned LastStageNum, 207 unsigned CurStageNum, bool IsLast); 208 void removeDeadInstructions(MachineBasicBlock *KernelBB, 209 MBBVectorTy &EpilogBBs); 210 void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs); 211 void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, 212 MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, 213 ValueMapTy *VRMap); 214 bool computeDelta(MachineInstr &MI, unsigned &Delta); 215 void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, 216 unsigned Num); 217 MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, 218 unsigned InstStageNum); 219 MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, 220 unsigned InstStageNum); 221 void updateInstruction(MachineInstr *NewMI, bool LastDef, 222 unsigned CurStageNum, unsigned InstrStageNum, 223 ValueMapTy *VRMap); 224 MachineInstr *findDefInLoop(unsigned Reg); 225 unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, 226 unsigned LoopStage, ValueMapTy *VRMap, 227 MachineBasicBlock *BB); 228 void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, 229 ValueMapTy *VRMap, InstrMapTy &InstrMap); 230 void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap, 231 unsigned CurStageNum, unsigned PhiNum, 232 MachineInstr *Phi, unsigned OldReg, 233 unsigned NewReg, unsigned PrevReg = 0); 234 bool isLoopCarried(MachineInstr &Phi); 235 236 /// Return the max. number of stages/iterations that can occur between a 237 /// register definition and its uses. getStagesForReg(int Reg,unsigned CurStage)238 unsigned getStagesForReg(int Reg, unsigned CurStage) { 239 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; 240 if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 && 241 Stages.second) 242 return 1; 243 return Stages.first; 244 } 245 246 /// The number of stages for a Phi is a little different than other 247 /// instructions. The minimum value computed in RegToStageDiff is 1 248 /// because we assume the Phi is needed for at least 1 iteration. 249 /// This is not the case if the loop value is scheduled prior to the 250 /// Phi in the same stage. This function returns the number of stages 251 /// or iterations needed between the Phi definition and any uses. getStagesForPhi(int Reg)252 unsigned getStagesForPhi(int Reg) { 253 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; 254 if (Stages.second) 255 return Stages.first; 256 return Stages.first - 1; 257 } 258 259 public: 260 /// Create a new ModuloScheduleExpander. 261 /// \arg InstrChanges Modifications to make to instructions with memory 262 /// operands. 263 /// FIXME: InstrChanges is opaque and is an implementation detail of an 264 /// optimization in MachinePipeliner that crosses abstraction boundaries. ModuloScheduleExpander(MachineFunction & MF,ModuloSchedule & S,LiveIntervals & LIS,InstrChangesTy InstrChanges)265 ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, 266 LiveIntervals &LIS, InstrChangesTy InstrChanges) 267 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), 268 TII(ST.getInstrInfo()), LIS(LIS), 269 InstrChanges(std::move(InstrChanges)) {} 270 271 /// Performs the actual expansion. 272 void expand(); 273 /// Performs final cleanup after expansion. 274 void cleanup(); 275 276 /// Returns the newly rewritten kernel block, or nullptr if this was 277 /// optimized away. getRewrittenKernel()278 MachineBasicBlock *getRewrittenKernel() { return NewKernel; } 279 }; 280 281 /// A reimplementation of ModuloScheduleExpander. It works by generating a 282 /// standalone kernel loop and peeling out the prologs and epilogs. 283 class PeelingModuloScheduleExpander { 284 public: PeelingModuloScheduleExpander(MachineFunction & MF,ModuloSchedule & S,LiveIntervals * LIS)285 PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, 286 LiveIntervals *LIS) 287 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), 288 TII(ST.getInstrInfo()), LIS(LIS) {} 289 290 void expand(); 291 292 /// Runs ModuloScheduleExpander and treats it as a golden input to validate 293 /// aspects of the code generated by PeelingModuloScheduleExpander. 294 void validateAgainstModuloScheduleExpander(); 295 296 protected: 297 ModuloSchedule &Schedule; 298 MachineFunction &MF; 299 const TargetSubtargetInfo &ST; 300 MachineRegisterInfo &MRI; 301 const TargetInstrInfo *TII = nullptr; 302 LiveIntervals *LIS = nullptr; 303 304 /// The original loop block that gets rewritten in-place. 305 MachineBasicBlock *BB = nullptr; 306 /// The original loop preheader. 307 MachineBasicBlock *Preheader = nullptr; 308 /// All prolog and epilog blocks. 309 SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs; 310 /// For every block, the stages that are produced. 311 DenseMap<MachineBasicBlock *, BitVector> LiveStages; 312 /// For every block, the stages that are available. A stage can be available 313 /// but not produced (in the epilog) or produced but not available (in the 314 /// prolog). 315 DenseMap<MachineBasicBlock *, BitVector> AvailableStages; 316 /// When peeling the epilogue keep track of the distance between the phi 317 /// nodes and the kernel. 318 DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration; 319 320 /// CanonicalMIs and BlockMIs form a bidirectional map between any of the 321 /// loop kernel clones. 322 DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs; 323 DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *> 324 BlockMIs; 325 326 /// State passed from peelKernel to peelPrologAndEpilogs(). 327 std::deque<MachineBasicBlock *> PeeledFront, PeeledBack; 328 /// Illegal phis that need to be deleted once we re-link stages. 329 SmallVector<MachineInstr *, 4> IllegalPhisToDelete; 330 331 /// Converts BB from the original loop body to the rewritten, pipelined 332 /// steady-state. 333 void rewriteKernel(); 334 335 /// Peels one iteration of the rewritten kernel (BB) in the specified 336 /// direction. 337 MachineBasicBlock *peelKernel(LoopPeelDirection LPD); 338 // Delete instructions whose stage is less than MinStage in the given basic 339 // block. 340 void filterInstructions(MachineBasicBlock *MB, int MinStage); 341 // Move instructions of the given stage from sourceBB to DestBB. Remap the phi 342 // instructions to keep a valid IR. 343 void moveStageBetweenBlocks(MachineBasicBlock *DestBB, 344 MachineBasicBlock *SourceBB, unsigned Stage); 345 /// Peel the kernel forwards and backwards to produce prologs and epilogs, 346 /// and stitch them together. 347 void peelPrologAndEpilogs(); 348 /// All prolog and epilog blocks are clones of the kernel, so any produced 349 /// register in one block has an corollary in all other blocks. 350 Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB); 351 /// Change all users of MI, if MI is predicated out 352 /// (LiveStages[MI->getParent()] == false). 353 void rewriteUsesOf(MachineInstr *MI); 354 /// Insert branches between prologs, kernel and epilogs. 355 void fixupBranches(); 356 /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block 357 /// to a block dominated by all prologs and epilogs. This allows us to treat 358 /// the loop exiting block as any other kernel clone. 359 MachineBasicBlock *CreateLCSSAExitingBlock(); 360 /// Helper to get the stage of an instruction in the schedule. getStage(MachineInstr * MI)361 unsigned getStage(MachineInstr *MI) { 362 if (CanonicalMIs.count(MI)) 363 MI = CanonicalMIs[MI]; 364 return Schedule.getStage(MI); 365 } 366 /// Helper function to find the right canonical register for a phi instruction 367 /// coming from a peeled out prologue. 368 Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi); 369 /// Target loop info before kernel peeling. 370 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; 371 }; 372 373 /// Expand the kernel using modulo variable expansion algorithm (MVE). 374 /// It unrolls the kernel enough to avoid overlap of register lifetime. 375 class ModuloScheduleExpanderMVE { 376 private: 377 using ValueMapTy = DenseMap<unsigned, unsigned>; 378 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; 379 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; 380 381 ModuloSchedule &Schedule; 382 MachineFunction &MF; 383 const TargetSubtargetInfo &ST; 384 MachineRegisterInfo &MRI; 385 const TargetInstrInfo *TII = nullptr; 386 LiveIntervals &LIS; 387 388 MachineBasicBlock *OrigKernel = nullptr; 389 MachineBasicBlock *OrigPreheader = nullptr; 390 MachineBasicBlock *OrigExit = nullptr; 391 MachineBasicBlock *Check = nullptr; 392 MachineBasicBlock *Prolog = nullptr; 393 MachineBasicBlock *NewKernel = nullptr; 394 MachineBasicBlock *Epilog = nullptr; 395 MachineBasicBlock *NewPreheader = nullptr; 396 MachineBasicBlock *NewExit = nullptr; 397 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; 398 399 /// The number of unroll required to avoid overlap of live ranges. 400 /// NumUnroll = 1 means no unrolling. 401 int NumUnroll; 402 403 void calcNumUnroll(); 404 void generatePipelinedLoop(); 405 void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap); 406 void generatePhi(MachineInstr *OrigMI, int UnrollNum, 407 SmallVectorImpl<ValueMapTy> &PrologVRMap, 408 SmallVectorImpl<ValueMapTy> &KernelVRMap, 409 SmallVectorImpl<ValueMapTy> &PhiVRMap); 410 void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap, 411 SmallVectorImpl<ValueMapTy> &KernelVRMap, 412 InstrMapTy &LastStage0Insts); 413 void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap, 414 SmallVectorImpl<ValueMapTy> &EpilogVRMap, 415 InstrMapTy &LastStage0Insts); 416 void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg); 417 418 MachineInstr *cloneInstr(MachineInstr *OldMI); 419 420 void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef); 421 422 void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal, 423 unsigned UnrollNum, 424 SmallVectorImpl<ValueMapTy> &VRMapProlog, 425 SmallVectorImpl<ValueMapTy> &VRMapPhi); 426 void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum, 427 SmallVectorImpl<ValueMapTy> &CurVRMap, 428 SmallVectorImpl<ValueMapTy> *PrevVRMap); 429 430 void insertCondBranch(MachineBasicBlock &MBB, int RequiredTC, 431 InstrMapTy &LastStage0Insts, 432 MachineBasicBlock &GreaterThan, 433 MachineBasicBlock &Otherwise); 434 435 public: ModuloScheduleExpanderMVE(MachineFunction & MF,ModuloSchedule & S,LiveIntervals & LIS)436 ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S, 437 LiveIntervals &LIS) 438 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), 439 TII(ST.getInstrInfo()), LIS(LIS) {} 440 441 void expand(); 442 static bool canApply(MachineLoop &L); 443 }; 444 445 /// Expander that simply annotates each scheduled instruction with a post-instr 446 /// symbol that can be consumed by the ModuloScheduleTest pass. 447 /// 448 /// The post-instr symbol is a way of annotating an instruction that can be 449 /// roundtripped in MIR. The syntax is: 450 /// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5> 451 class ModuloScheduleTestAnnotater { 452 MachineFunction &MF; 453 ModuloSchedule &S; 454 455 public: ModuloScheduleTestAnnotater(MachineFunction & MF,ModuloSchedule & S)456 ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S) 457 : MF(MF), S(S) {} 458 459 /// Performs the annotation. 460 void annotate(); 461 }; 462 463 } // end namespace llvm 464 465 #endif // LLVM_CODEGEN_MODULOSCHEDULE_H 466