//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
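// For illustration (a sketch, not taken from an actual test; wave64 assumed,
// register numbers made up), a cross-block i1 value such as
//
//   bb.0:
//     %0:vreg_1 = COPY %cond    ; %cond is a lane mask in a 64-bit SGPR pair
//   bb.1:
//     %1:vreg_1 = PHI %0, %bb.0, ...
//
// is rewritten so that %0 and %1 become 64-bit scalar lane-mask registers,
// with S_AND_B64 / S_ANDN2_B64 / S_OR_B64 sequences inserted where values
// arriving on different control flow paths must be merged per lane.
//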
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}
/// Helper class that analyzes the relationships between the incoming values
/// of a phi in the control flow graph to determine where an incoming value
/// can simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
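/// For example, in a triangle A -> B -> D, A -> D, where A ends in a
/// divergent branch and a phi in D has incoming values from A and B: the set
/// of blocks a wave may reach before entering D contains both A and B. B has
/// its predecessor A in the set, so its incoming value must be merged,
/// whereas A has no predecessor in the set and its incoming value can be
/// taken as-is.
///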
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  MapVector<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (ReachableMap.try_emplace(MBB, false).second)
        append_range(Stack, MBB->successors());
    }

    for (auto &[MBB, Reachable] : ReachableMap) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        Reachable = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
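///
/// Applied to the example above: for a def in B used in C, the nearest common
/// post-dominator of B and C is C, and a backward edge to B (via B -> A -> B)
/// is reachable without passing through C, so the def in B requires the
/// bitwise lowering.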
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def
  // block, level 1 contains all blocks reachable up to and including the def
  // block's IPDOM but not beyond it, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

Register
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                        MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

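      // Per lane: DstReg = SrcReg[lane] ? -1 : 0, i.e. V_CNDMASK selects
      // between the two immediate operands based on the lane mask.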
      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other
    // incoming values are sorted earlier. This allows us to do some amount of
    // on-the-fly constant folding. Incomings with a smaller DFSNumIn go
    // first; DFSNumIn is 0 for the entry block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
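        // Convert the 32-bit source to a lane mask: each lane is set iff the
        // lane's value is non-zero.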
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond the copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

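/// Determine whether \p Reg holds an all-zeros or all-ones lane mask, looking
/// through plain lane-mask copies. On success, \p Val receives the constant's
/// value; for IMPLICIT_DEF the function returns true without assigning \p Val
/// (callers pre-initialize it).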
bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
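    // If this instruction defines SCC, it is safe to insert right before it:
    // the lane-mask SALU instructions may clobber SCC themselves, but this
    // def re-establishes the value that the terminator reads.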
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

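// Merge the previous and current lane masks into DstReg. Apart from the
// constant-folding special cases handled below, the emitted code computes
//   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
// so active lanes take the current value while inactive lanes keep their
// previous value.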
void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) {
  // Only need to run this on the SelectionDAG path.
  if (MF.getProperties().hasSelected())
    return false;

  Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree &MPDT =
      MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  bool Changed = runFixI1Copies(MF, MDT, MPDT);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: Probably preserves most.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) {
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &MPDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  return runFixI1Copies(MF, MDT, MPDT);
}

INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = 0;

char &llvm::SILowerI1CopiesLegacyID = SILowerI1CopiesLegacy::ID;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() {
  return new SILowerI1CopiesLegacy();
}