xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
1  //
2  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3  // See https://llvm.org/LICENSE.txt for license information.
4  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5  //
6  //===----------------------------------------------------------------------===//
7  //
8  // This file contains a pass that performs optimization on SIMD instructions
9  // with high latency by splitting them into more efficient series of
10  // instructions.
11  //
12  // 1. Rewrite certain SIMD instructions with a vector element operand due to
13  // their inefficiency on some targets.
14  //
15  // For example:
16  //    fmla v0.4s, v1.4s, v2.s[1]
17  //
18  // Is rewritten into:
19  //    dup v3.4s, v2.s[1]
20  //    fmla v0.4s, v1.4s, v3.4s
21  //
22  // 2. Rewrite interleaved memory access instructions due to their
23  // inefficiency on some targets.
24  //
25  // For example:
26  //    st2 {v0.4s, v1.4s}, addr
27  //
28  // Is rewritten into:
29  //    zip1 v2.4s, v0.4s, v1.4s
30  //    zip2 v3.4s, v0.4s, v1.4s
31  //    stp  q2, q3, addr
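    //
    //    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr is handled analogously: a tree
    //    of zip1/zip2 instructions feeding two stp instructions (see the
    //    expansion documented at optimizeLdStInterleave below).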
32  //
33  //===----------------------------------------------------------------------===//
34  
35  #include "AArch64InstrInfo.h"
36  #include "llvm/ADT/SmallVector.h"
37  #include "llvm/ADT/Statistic.h"
38  #include "llvm/ADT/StringRef.h"
39  #include "llvm/CodeGen/MachineBasicBlock.h"
40  #include "llvm/CodeGen/MachineFunction.h"
41  #include "llvm/CodeGen/MachineFunctionPass.h"
42  #include "llvm/CodeGen/MachineInstr.h"
43  #include "llvm/CodeGen/MachineInstrBuilder.h"
44  #include "llvm/CodeGen/MachineOperand.h"
45  #include "llvm/CodeGen/MachineRegisterInfo.h"
46  #include "llvm/CodeGen/TargetInstrInfo.h"
47  #include "llvm/CodeGen/TargetSchedule.h"
48  #include "llvm/CodeGen/TargetSubtargetInfo.h"
49  #include "llvm/MC/MCInstrDesc.h"
50  #include "llvm/MC/MCSchedule.h"
51  #include "llvm/Pass.h"
52  #include <map>
53  #include <unordered_map>
54  
55  using namespace llvm;
56  
57  #define DEBUG_TYPE "aarch64-simdinstr-opt"
58  
59  STATISTIC(NumModifiedInstr,
60            "Number of SIMD instructions modified");
61  
62  #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
63    "AArch64 SIMD instructions optimization pass"
64  
65  namespace {
66  
67  struct AArch64SIMDInstrOpt : public MachineFunctionPass {
68    static char ID;
69  
70    const TargetInstrInfo *TII;
71    MachineRegisterInfo *MRI;
72    TargetSchedModel SchedModel;
73  
74    // The two maps below cache decisions so they are not recomputed:
75    // Caches instruction replacement decisions within and across function
76    // units, keyed by opcode and subtarget CPU name.
77    std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
78    // Caches, per target CPU, the decision of whether to exit the interleaved-
79    // store replacement subpass early.
80    std::unordered_map<std::string, bool> InterlEarlyExit;
81  
82    typedef enum {
83      VectorElem,
84      Interleave
85    } Subpass;
86  
87    // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
88    struct InstReplInfo {
89      unsigned OrigOpc;
90    std::vector<unsigned> ReplOpc;
91      const TargetRegisterClass RC;
92    };
93  
94  #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
95    {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
96  #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
97                  OpcR7, OpcR8, OpcR9, RC) \
98    {OpcOrg, \
99     {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
100  
101    // The Instruction Replacement Table:
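       // For example, the first ST2 rule below expands via RuleST2 to the entry
       //   {AArch64::ST2Twov2d,
       //    {AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::STPQi},
       //    AArch64::FPR128RegClass}
       // i.e. one ST2 is modeled as two ZIPs whose results feed a single STP.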
102    std::vector<InstReplInfo> IRT = {
103      // ST2 instructions
104      RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
105            AArch64::STPQi, AArch64::FPR128RegClass),
106      RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
107            AArch64::STPQi, AArch64::FPR128RegClass),
108      RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
109            AArch64::STPDi, AArch64::FPR64RegClass),
110      RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
111            AArch64::STPQi, AArch64::FPR128RegClass),
112      RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
113            AArch64::STPDi, AArch64::FPR64RegClass),
114      RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
115            AArch64::STPQi, AArch64::FPR128RegClass),
116      RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
117            AArch64::STPDi, AArch64::FPR64RegClass),
118      // ST4 instructions
119      RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
120            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
121            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
122            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
123      RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
124            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
125            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
126            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
127      RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
128            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
129            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
130            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
131      RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
132            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
133            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
134            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
135      RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
136            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
137            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
138            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
139      RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
140            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
141            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
142            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
143      RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
144            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
145            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
146            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
147    };
148  
149    // A costly instruction is replaced in this pass by N efficient instructions.
150    // The maximum N is currently 10 (the ST4 case: eight ZIPs plus two STPs).
151    static const unsigned MaxNumRepl = 10;
152  
153    AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
154      initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
155    }
156  
157    /// Based only on latency of instructions, determine if it is cost efficient
158    /// to replace the instruction InstDesc by the instructions stored in the
159    /// array InstDescRepl.
160    /// Return true if replacement is expected to be faster.
161    bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
162                           SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
163  
164    /// Determine if we need to exit the instruction replacement optimization
165    /// passes early. This makes sure that no compile time is spent in this pass
166    /// for targets with no need for any of these optimizations.
167    /// Return true if early exit of the pass is recommended.
168    bool shouldExitEarly(MachineFunction *MF, Subpass SP);
169  
170    /// Check whether an equivalent DUP instruction has already been
171    /// created or not.
172    /// Return true when the DUP instruction already exists. In this case,
173    /// DestReg will point to the destination of the already created DUP.
174    bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
175                  unsigned LaneNumber, unsigned *DestReg) const;
176  
177    /// Certain SIMD instructions with vector element operand are not efficient.
178    /// Rewrite them into SIMD instructions with vector operands. This rewrite
179    /// is driven by the latency of the instructions.
180    /// Return true if the SIMD instruction is modified.
181    bool optimizeVectElement(MachineInstr &MI);
182  
183    /// Process the REG_SEQUENCE instruction and extract the source
184    /// operands of the ST2/4 instruction from it.
185    /// Example of such an instruction:
186    ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
187    /// Return true when the instruction is processed successfully.
188    bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
189                           unsigned* StRegKill, unsigned NumArg) const;
190  
191    /// Load/Store Interleaving instructions are not always beneficial.
192    /// Replace them by ZIP instructions and classical load/store instructions.
193    /// Return true if the SIMD instruction is modified.
194    bool optimizeLdStInterleave(MachineInstr &MI);
195  
196    /// Return the number of useful source registers for this
197    /// instruction (2 for ST2 and 4 for ST4).
198    unsigned determineSrcReg(MachineInstr &MI) const;
199  
200    bool runOnMachineFunction(MachineFunction &Fn) override;
201  
202    StringRef getPassName() const override {
203      return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
204    }
205  };
206  
207  char AArch64SIMDInstrOpt::ID = 0;
208  
209  } // end anonymous namespace
210  
211  INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
212                  AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
213  
214  /// Based only on latency of instructions, determine if it is cost efficient
215  /// to replace the instruction InstDesc by the instructions stored in the
216  /// array InstDescRepl.
217  /// Return true if replacement is expected to be faster.
218  bool AArch64SIMDInstrOpt::
219  shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
220                    SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
221    // Check if the replacement decision is already available in the cached
222    // table; if so, return it.
223    std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
224    auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
225    auto It = SIMDInstrTable.find(InstID);
226    if (It != SIMDInstrTable.end())
227      return It->second;
228  
229    unsigned SCIdx = InstDesc->getSchedClass();
230    const MCSchedClassDesc *SCDesc =
231      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
232  
233    // If the target does not define scheduling information for the instructions
234    // of interest, conservatively return false (no replacement).
235    const MCSchedClassDesc *SCDescRepl;
236    if (!SCDesc->isValid() || SCDesc->isVariant())
237    {
238      SIMDInstrTable[InstID] = false;
239      return false;
240    }
241    for (const auto *IDesc : InstDescRepl)
242    {
243      SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
244        IDesc->getSchedClass());
245      if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
246      {
247        SIMDInstrTable[InstID] = false;
248        return false;
249      }
250    }
251  
252    // Replacement cost.
253    unsigned ReplCost = 0;
254    for (const auto *IDesc : InstDescRepl)
255      ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
256  
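       // Illustrative comparison (hypothetical latencies, not taken from any real
       // scheduling model): if the original indexed FMLA is modeled at 9 cycles
       // while DUP (3) plus FMLA (5) sum to 8, the strict '>' test below accepts
       // the replacement; equal latencies keep the original instruction.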
257    if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
258    {
259      SIMDInstrTable[InstID] = true;
260      return true;
261    }
262    else
263    {
264      SIMDInstrTable[InstID] = false;
265      return false;
266    }
267  }
268  
269  /// Determine if this pass should exit early for a given kind of instruction
270  /// replacement. This makes sure that, beyond performing this check, no compile
271  /// time is spent in the pass for targets that need none of these
272  /// optimizations.
273  /// Return true if early exit of this pass for the given kind of instruction
274  /// replacement is recommended for the target.
275  bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
276    const MCInstrDesc* OriginalMCID;
277    SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
278  
279    switch (SP) {
280    // For this optimization, check by comparing the latency of a representative
281    // instruction to that of the replacement instructions.
282    // TODO: check for all concerned instructions.
283    case VectorElem:
284      OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
285      ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
286      ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
287      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
288        return false;
289      break;
290  
291    // For this optimization, check for all concerned instructions.
292    case Interleave:
293      std::string Subtarget =
294          std::string(SchedModel.getSubtargetInfo()->getCPU());
295      auto It = InterlEarlyExit.find(Subtarget);
296      if (It != InterlEarlyExit.end())
297        return It->second;
298  
299      for (auto &I : IRT) {
300        OriginalMCID = &TII->get(I.OrigOpc);
301        for (auto &Repl : I.ReplOpc)
302          ReplInstrMCID.push_back(&TII->get(Repl));
303        if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
304          InterlEarlyExit[Subtarget] = false;
305          return false;
306        }
307        ReplInstrMCID.clear();
308      }
309      InterlEarlyExit[Subtarget] = true;
310      break;
311    }
312  
313    return true;
314  }
315  
316  /// Check whether an equivalent DUP instruction has already been
317  /// created or not.
318  /// Return true when the DUP instruction already exists. In this case,
319  /// DestReg will point to the destination of the already created DUP.
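     /// For example (illustrative registers), when two FMLAs use the same v2.s[1]
     /// element, the DUP created while rewriting the first can be reused by the
     /// second:
     ///    dup  v3.4s, v2.s[1]
     ///    fmla v0.4s, v1.4s, v3.4s
     ///    fmla v4.4s, v5.4s, v3.4s   // no second dup needed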
320  bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
321                                           unsigned SrcReg, unsigned LaneNumber,
322                                           unsigned *DestReg) const {
323    for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
324         MII != MIE;) {
325      MII--;
326      MachineInstr *CurrentMI = &*MII;
327  
328      if (CurrentMI->getOpcode() == DupOpcode &&
329          CurrentMI->getNumOperands() == 3 &&
330          CurrentMI->getOperand(1).getReg() == SrcReg &&
331          CurrentMI->getOperand(2).getImm() == LaneNumber) {
332        *DestReg = CurrentMI->getOperand(0).getReg();
333        return true;
334      }
335    }
336  
337    return false;
338  }
339  
340  /// Certain SIMD instructions with vector element operand are not efficient.
341  /// Rewrite them into SIMD instructions with vector operands. This rewrite
342  /// is driven by the latency of the instructions.
343  /// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
344  /// and FMULX, and hence they are hardcoded here.
345  ///
346  /// For example:
347  ///    fmla v0.4s, v1.4s, v2.s[1]
348  ///
349  /// Is rewritten into:
350  ///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
351  ///    fmla v0.4s, v1.4s, v3.4s
352  ///
353  /// Return true if the SIMD instruction is modified.
354  bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
355    const MCInstrDesc *MulMCID, *DupMCID;
356    const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
357  
358    switch (MI.getOpcode()) {
359    default:
360      return false;
361  
362    // 4X32 instructions
363    case AArch64::FMLAv4i32_indexed:
364      DupMCID = &TII->get(AArch64::DUPv4i32lane);
365      MulMCID = &TII->get(AArch64::FMLAv4f32);
366      break;
367    case AArch64::FMLSv4i32_indexed:
368      DupMCID = &TII->get(AArch64::DUPv4i32lane);
369      MulMCID = &TII->get(AArch64::FMLSv4f32);
370      break;
371    case AArch64::FMULXv4i32_indexed:
372      DupMCID = &TII->get(AArch64::DUPv4i32lane);
373      MulMCID = &TII->get(AArch64::FMULXv4f32);
374      break;
375    case AArch64::FMULv4i32_indexed:
376      DupMCID = &TII->get(AArch64::DUPv4i32lane);
377      MulMCID = &TII->get(AArch64::FMULv4f32);
378      break;
379  
380    // 2X64 instructions
381    case AArch64::FMLAv2i64_indexed:
382      DupMCID = &TII->get(AArch64::DUPv2i64lane);
383      MulMCID = &TII->get(AArch64::FMLAv2f64);
384      break;
385    case AArch64::FMLSv2i64_indexed:
386      DupMCID = &TII->get(AArch64::DUPv2i64lane);
387      MulMCID = &TII->get(AArch64::FMLSv2f64);
388      break;
389    case AArch64::FMULXv2i64_indexed:
390      DupMCID = &TII->get(AArch64::DUPv2i64lane);
391      MulMCID = &TII->get(AArch64::FMULXv2f64);
392      break;
393    case AArch64::FMULv2i64_indexed:
394      DupMCID = &TII->get(AArch64::DUPv2i64lane);
395      MulMCID = &TII->get(AArch64::FMULv2f64);
396      break;
397  
398    // 2X32 instructions
399    case AArch64::FMLAv2i32_indexed:
400      RC = &AArch64::FPR64RegClass;
401      DupMCID = &TII->get(AArch64::DUPv2i32lane);
402      MulMCID = &TII->get(AArch64::FMLAv2f32);
403      break;
404    case AArch64::FMLSv2i32_indexed:
405      RC = &AArch64::FPR64RegClass;
406      DupMCID = &TII->get(AArch64::DUPv2i32lane);
407      MulMCID = &TII->get(AArch64::FMLSv2f32);
408      break;
409    case AArch64::FMULXv2i32_indexed:
410      RC = &AArch64::FPR64RegClass;
411      DupMCID = &TII->get(AArch64::DUPv2i32lane);
412      MulMCID = &TII->get(AArch64::FMULXv2f32);
413      break;
414    case AArch64::FMULv2i32_indexed:
415      RC = &AArch64::FPR64RegClass;
416      DupMCID = &TII->get(AArch64::DUPv2i32lane);
417      MulMCID = &TII->get(AArch64::FMULv2f32);
418      break;
419    }
420  
421    SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
422    ReplInstrMCID.push_back(DupMCID);
423    ReplInstrMCID.push_back(MulMCID);
424    if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
425                           ReplInstrMCID))
426      return false;
427  
428    const DebugLoc &DL = MI.getDebugLoc();
429    MachineBasicBlock &MBB = *MI.getParent();
430    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
431  
432    // Get the operands of the current SIMD arithmetic instruction.
433    Register MulDest = MI.getOperand(0).getReg();
434    Register SrcReg0 = MI.getOperand(1).getReg();
435    unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
436    Register SrcReg1 = MI.getOperand(2).getReg();
437    unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
438    unsigned DupDest;
439  
440    // Instructions of interest have either 4 or 5 operands.
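       // FMLA/FMLS accumulate and therefore carry an accumulator source tied to
       // the destination besides the two multiplicands (dst, acc, src1, src2,
       // lane = 5 operands), whereas FMUL/FMULX do not (dst, src1, src2, lane =
       // 4 operands).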
441    if (MI.getNumOperands() == 5) {
442      Register SrcReg2 = MI.getOperand(3).getReg();
443      unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
444      unsigned LaneNumber = MI.getOperand(4).getImm();
445      // Create a new DUP instruction. Note that if an equivalent DUP instruction
446      // has already been created before, then use that one instead of creating
447      // a new one.
448      if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
449        DupDest = MRI.createVirtualRegister(RC);
450        BuildMI(MBB, MI, DL, *DupMCID, DupDest)
451            .addReg(SrcReg2, Src2IsKill)
452            .addImm(LaneNumber);
453      }
454      BuildMI(MBB, MI, DL, *MulMCID, MulDest)
455          .addReg(SrcReg0, Src0IsKill)
456          .addReg(SrcReg1, Src1IsKill)
457          .addReg(DupDest, Src2IsKill);
458    } else if (MI.getNumOperands() == 4) {
459      unsigned LaneNumber = MI.getOperand(3).getImm();
460      if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
461        DupDest = MRI.createVirtualRegister(RC);
462        BuildMI(MBB, MI, DL, *DupMCID, DupDest)
463            .addReg(SrcReg1, Src1IsKill)
464            .addImm(LaneNumber);
465      }
466      BuildMI(MBB, MI, DL, *MulMCID, MulDest)
467          .addReg(SrcReg0, Src0IsKill)
468          .addReg(DupDest, Src1IsKill);
469    } else {
470      return false;
471    }
472  
473    ++NumModifiedInstr;
474    return true;
475  }
476  
477  /// Load/Store Interleaving instructions are not always beneficial.
478  /// Replace them by ZIP instructions and classical load/store.
479  ///
480  /// For example:
481  ///    st2 {v0.4s, v1.4s}, addr
482  ///
483  /// Is rewritten into:
484  ///    zip1 v2.4s, v0.4s, v1.4s
485  ///    zip2 v3.4s, v0.4s, v1.4s
486  ///    stp  q2, q3, addr
487  ///
488  /// For example:
489  ///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
490  ///
491  /// Is rewritten into:
492  ///    zip1 v4.4s, v0.4s, v2.4s
493  ///    zip2 v5.4s, v0.4s, v2.4s
494  ///    zip1 v6.4s, v1.4s, v3.4s
495  ///    zip2 v7.4s, v1.4s, v3.4s
496  ///    zip1 v8.4s, v4.4s, v6.4s
497  ///    zip2 v9.4s, v4.4s, v6.4s
498  ///    zip1 v10.4s, v5.4s, v7.4s
499  ///    zip2 v11.4s, v5.4s, v7.4s
500  ///    stp  q8, q9, addr
501  ///    stp  q10, q11, addr+32
502  ///
503  /// Currently only instructions related to ST2 and ST4 are considered.
504  /// Others may be added later.
505  /// Return true if the SIMD instruction is modified.
506  bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
507  
508    unsigned SeqReg, AddrReg;
509    unsigned StReg[4], StRegKill[4];
510    MachineInstr *DefiningMI;
511    const DebugLoc &DL = MI.getDebugLoc();
512    MachineBasicBlock &MBB = *MI.getParent();
513    SmallVector<unsigned, MaxNumRepl> ZipDest;
514    SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
515  
516    // If current instruction matches any of the rewriting rules, then
517    // gather information about parameters of the new instructions.
518    bool Match = false;
519    for (auto &I : IRT) {
520      if (MI.getOpcode() == I.OrigOpc) {
521        SeqReg  = MI.getOperand(0).getReg();
522        AddrReg = MI.getOperand(1).getReg();
523        DefiningMI = MRI->getUniqueVRegDef(SeqReg);
524        unsigned NumReg = determineSrcReg(MI);
525        if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
526          return false;
527  
528        for (auto &Repl : I.ReplOpc) {
529          ReplInstrMCID.push_back(&TII->get(Repl));
530        // Generate destination registers, but only for non-store instructions.
531          if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
532            ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
533        }
534        Match = true;
535        break;
536      }
537    }
538  
539    if (!Match)
540      return false;
541  
542    // Determine if it is profitable to replace MI by the series of instructions
543    // represented in ReplInstrMCID.
544    if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
545                           ReplInstrMCID))
546      return false;
547  
548    // Generate the replacement instructions composed of ZIP1, ZIP2, and STP. At
549    // this point the code generation is hardcoded and does not rely on the IRT
550    // table used above, given that code generation for the ST2 replacement is
551    // somewhat different from that for the ST4 replacement. We could have encoded
552    // how the new instructions are built in the table as well, but that would
553    // likely add more complexity than it is worth.
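       // Index layout implied by the IRT rules above: for ST2, ReplInstrMCID holds
       // [0] ZIP1, [1] ZIP2, [2] STP, and ZipDest receives an entry per ZIP only;
       // for ST4, indices [0..7] are the eight ZIPs and [8..9] the two STPs.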
554    switch (MI.getOpcode()) {
555    default:
556      return false;
557  
558    case AArch64::ST2Twov16b:
559    case AArch64::ST2Twov8b:
560    case AArch64::ST2Twov8h:
561    case AArch64::ST2Twov4h:
562    case AArch64::ST2Twov4s:
563    case AArch64::ST2Twov2s:
564    case AArch64::ST2Twov2d:
565      // ZIP instructions
566      BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
567          .addReg(StReg[0])
568          .addReg(StReg[1]);
569      BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
570          .addReg(StReg[0], StRegKill[0])
571          .addReg(StReg[1], StRegKill[1]);
572      // STP instructions
573      BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
574          .addReg(ZipDest[0])
575          .addReg(ZipDest[1])
576          .addReg(AddrReg)
577          .addImm(0);
578      break;
579  
580    case AArch64::ST4Fourv16b:
581    case AArch64::ST4Fourv8b:
582    case AArch64::ST4Fourv8h:
583    case AArch64::ST4Fourv4h:
584    case AArch64::ST4Fourv4s:
585    case AArch64::ST4Fourv2s:
586    case AArch64::ST4Fourv2d:
587      // ZIP instructions
588      BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
589          .addReg(StReg[0])
590          .addReg(StReg[2]);
591      BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
592          .addReg(StReg[0], StRegKill[0])
593          .addReg(StReg[2], StRegKill[2]);
594      BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
595          .addReg(StReg[1])
596          .addReg(StReg[3]);
597      BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
598          .addReg(StReg[1], StRegKill[1])
599          .addReg(StReg[3], StRegKill[3]);
600      BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
601          .addReg(ZipDest[0])
602          .addReg(ZipDest[2]);
603      BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
604          .addReg(ZipDest[0])
605          .addReg(ZipDest[2]);
606      BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
607          .addReg(ZipDest[1])
608          .addReg(ZipDest[3]);
609      BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
610          .addReg(ZipDest[1])
611          .addReg(ZipDest[3]);
612      // stp instructions
613      BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
614          .addReg(ZipDest[4])
615          .addReg(ZipDest[5])
616          .addReg(AddrReg)
617          .addImm(0);
618      BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
619          .addReg(ZipDest[6])
620          .addReg(ZipDest[7])
621          .addReg(AddrReg)
622          .addImm(2);
623      break;
624    }
625  
626    ++NumModifiedInstr;
627    return true;
628  }
629  
630  /// Process the REG_SEQUENCE instruction and extract the source
631  /// operands of the ST2/4 instruction from it.
632  /// Example of such an instruction:
633  ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
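     /// For an ST4 the sequence carries four sources, e.g. (sketch):
     ///    %dest = REG_SEQUENCE %src0, qsub0, %src1, qsub1, %src2, qsub2, %src3, qsub3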
634  /// Return true when the instruction is processed successfully.
635  bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
636       unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
637    assert(DefiningMI != nullptr);
638    if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
639      return false;
640  
641    for (unsigned i=0; i<NumArg; i++) {
642      StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
643      StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
644  
645      // Validation check for the other arguments.
646      if (DefiningMI->getOperand(2*i+2).isImm()) {
647        switch (DefiningMI->getOperand(2*i+2).getImm()) {
648        default:
649          return false;
650  
651        case AArch64::dsub0:
652        case AArch64::dsub1:
653        case AArch64::dsub2:
654        case AArch64::dsub3:
655        case AArch64::qsub0:
656        case AArch64::qsub1:
657        case AArch64::qsub2:
658        case AArch64::qsub3:
659          break;
660        }
661      }
662      else
663        return false;
664    }
665    return true;
666  }
667  
668  /// Return the number of useful source registers for this instruction
669  /// (2 for ST2 and 4 for ST4).
670  unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
671    switch (MI.getOpcode()) {
672    default:
673      llvm_unreachable("Unsupported instruction for this pass");
674  
675    case AArch64::ST2Twov16b:
676    case AArch64::ST2Twov8b:
677    case AArch64::ST2Twov8h:
678    case AArch64::ST2Twov4h:
679    case AArch64::ST2Twov4s:
680    case AArch64::ST2Twov2s:
681    case AArch64::ST2Twov2d:
682      return 2;
683  
684    case AArch64::ST4Fourv16b:
685    case AArch64::ST4Fourv8b:
686    case AArch64::ST4Fourv8h:
687    case AArch64::ST4Fourv4h:
688    case AArch64::ST4Fourv4s:
689    case AArch64::ST4Fourv2s:
690    case AArch64::ST4Fourv2d:
691      return 4;
692    }
693  }
694  
695  bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
696    if (skipFunction(MF.getFunction()))
697      return false;
698  
699    TII = MF.getSubtarget().getInstrInfo();
700    MRI = &MF.getRegInfo();
701    const TargetSubtargetInfo &ST = MF.getSubtarget();
702    const AArch64InstrInfo *AAII =
703        static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
704    if (!AAII)
705      return false;
706    SchedModel.init(&ST);
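       // Without a per-instruction scheduling model there is no latency information
       // to drive the profitability checks, so bail out immediately.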
707    if (!SchedModel.hasInstrSchedModel())
708      return false;
709  
710    bool Changed = false;
711    for (auto OptimizationKind : {VectorElem, Interleave}) {
712      if (!shouldExitEarly(&MF, OptimizationKind)) {
713        SmallVector<MachineInstr *, 8> RemoveMIs;
714        for (MachineBasicBlock &MBB : MF) {
715          for (MachineInstr &MI : MBB) {
716            bool InstRewrite;
717            if (OptimizationKind == VectorElem)
718            InstRewrite = optimizeVectElement(MI);
719            else
720              InstRewrite = optimizeLdStInterleave(MI);
721            if (InstRewrite) {
722              // Add MI to the list of instructions to be removed given that it
723              // has been replaced.
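                 // (Erasure is deferred so that the iteration over the basic
                 // block is not invalidated mid-loop.)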
724              RemoveMIs.push_back(&MI);
725              Changed = true;
726            }
727          }
728        }
729        for (MachineInstr *MI : RemoveMIs)
730          MI->eraseFromParent();
731      }
732    }
733  
734    return Changed;
735  }
736  
737  /// Returns an instance of the high cost ASIMD instruction replacement
738  /// optimization pass.
739  FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
740    return new AArch64SIMDInstrOpt();
741  }
742