//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data directly
// from the store to the load. This "store forwarding" saves cycles by enabling
// the load to obtain the data directly instead of accessing it from
// cache or memory.
// A "store forward block" occurs when a store cannot be forwarded to
// the load. The most typical case of a store forward block on the Intel Core
// microarchitecture is a small store that cannot be forwarded to a large load.
// The estimated penalty for a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward block"
// is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading
// to XMM/YMM, inserting the partial store, and then storing back from XMM/YMM
// to memory, but this would result in a more conservative optimization since it
// requires proving that all memory accesses between the blocking store and the
// load either must alias or don't alias before the store can be moved, whereas
// the transformation done here is correct regardless of other memory accesses.
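//
// For illustration only (the sizes and offsets below are invented), a lowered
// memcpy of the form
//
//   store 4 bytes  to   A+8        <- small store into the copied range
//   load  32 bytes from A          <- YMM load; forwarding from the store fails
//   store 32 bytes to   B
//
// is rewritten into narrower load/store pairs whose boundaries line up with
// the blocking store, e.g.
//
//   copy  8 bytes  A+0  -> B+0
//   copy  4 bytes  A+8  -> B+8     <- same size as the blocking store
//   copy 16 bytes  A+12 -> B+12
//   copy  4 bytes  A+28 -> B+28
//
// so the bytes written by the small store are reloaded by a load of matching
// size and forwarding is no longer blocked.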
//===----------------------------------------------------------------------===//

#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

#define DEBUG_TYPE "x86-avoid-SFB"

static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);

namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) { }

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI;
  const X86InstrInfo *TII;
  const X86RegisterInfo *TRI;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA;

  /// Returns pairs of loads and stores to memory that look
  /// like a memcpy.
  void findPotentiallyBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// so that each region that was blocked by a smaller store
  /// is now copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size into smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace

char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
                      "X86 Avoid Store Forwarding Blocks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
                    "X86 Avoid Store Forwarding Blocks", false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}

static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

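// Returns true if a store with opcode Opcode may block forwarding to a load
// with opcode LoadOpcode: scalar MOV stores (1/2/4/8 bytes) are narrower than
// any XMM/YMM load handled here, and 128-bit vector stores are narrower than
// a YMM load, so both groups are treated as potential blockers.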
static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

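// An X86 memory reference is encoded as a group of five operands (base
// register, scale, index register, displacement, segment register). The
// helpers below locate that group: its start index is the value returned by
// X86II::getMemoryOperandNo plus the instruction's operand bias.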
static int getAddrOffset(MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only base register and immediate
// displacement or frameindex and immediate displacement.
// TODO: Consider expanding to other addressing modes in the future
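// For example (AT&T syntax), accesses such as 16(%rsp) or a frame index plus
// an immediate qualify, while scaled-index forms like (%rax,%rcx,4) or
// segment-relative accesses do not.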
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  MachineOperand &Base = getBaseOperand(MI);
  MachineOperand &Disp = getDispOperand(MI);
  MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}

// Collect potentially blocking stores.
// Limit the number of instructions we inspect backwards, since the effect of
// a store block won't be visible if the store and load have enough
// instructions in between to keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
                                          PE = MBB->pred_end();
         PB != PE; ++PB) {
      MachineBasicBlock *PMBB = *PB;
      int PredCount = 0;
      for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
                                               PME = PMBB->rend();
           PBInst != PME; ++PBInst) {
        if (PBInst->isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst->getDesc().isCall())
          break;
        PotentialBlockers.push_back(&*PBInst);
      }
    }
  }
  return PotentialBlockers;
}

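// Emit one Size-byte copy: a load of Size bytes from LoadBase + LoadDisp into
// a fresh virtual register, followed by a store of that register to
// StoreBase + StoreDisp. The new instructions reuse the original memory
// operands, offset by LMMOffset/SMMOffset, so alias information is preserved.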
void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the LoadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = skipDebugInstructionsBackward(
      std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
      MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}

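// Split a copy of Size bytes into the widest moves that still fit: 16-byte
// XMM copies (only when the original load was a YMM load), then 8-, 4-, 2-
// and 1-byte scalar moves. For example, a non-YMM copy with Size == 13 is
// emitted as an 8-byte, a 4-byte and a 1-byte load/store pair.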
void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}

static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto StorePrevNonDbgInstr = skipDebugInstructionsBackward(
          std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
          LoadInst->getParent()->instr_begin()).getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

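// Conservatively assume the two operands alias when either one has no IR
// value to query. Otherwise ask alias analysis, enlarging each location by
// the distance between its own offset and the smaller of the two offsets so
// the query covers the displacement between the accesses.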
bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;

  AliasResult AAResult =
      AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
                MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
  return AAResult != NoAlias;
}

void X86AvoidSFBPass::findPotentiallyBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
           UI != UE;) {
        MachineOperand &StoreMO = *UI++;
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI)) {
          assert(MI.hasOneMemOperand() &&
                 "Expected one memory operand for load instruction");
          assert(StoreMI.hasOneMemOperand() &&
                 "Expected one memory operand for store instruction");
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}

unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                              *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}

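// Break the blocked copy into smaller copies around each blocking store.
// For illustration (numbers invented), a 32-byte copy whose load displacement
// is 0, blocked by stores recorded as {8: 4 bytes, 20: 2 bytes}, is emitted
// as copies of the ranges [0,8), [8,12), [12,20), [20,22) and [22,32), so the
// bytes written by each blocking store are reloaded at matching size.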
void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy of the region up to the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              LMMOffset);
}

static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

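// A store blocks the load when the stored bytes fall entirely inside the
// loaded range, i.e. LoadDispImm <= StoreDispImm and
// StoreDispImm + StoreSize <= LoadDispImm + LoadSize.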
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}

// Keep track of all stores blocking a load
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;

  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}

// Remove blocking stores contained in each other.
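// When one recorded range contains another, only the contained (smaller)
// entry is kept; e.g. entries {disp 0, size 16} and {disp 4, size 4}
// collapse to just {disp 4, size 4}.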
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}

bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load then a store to XMM/YMM which look like a memcpy
  findPotentiallyBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst))
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration; if the blocking
      // store is in a rarely reached block, breaking the memcpy could hurt
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block; break the memcpy's load and store
    // into smaller copies so that each region written by a smaller blocking
    // store is now copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
    Changed = true;
  }
  for (auto RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}