10b57cec5SDimitry Andric // 20b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 30b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 40b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 50b57cec5SDimitry Andric // 60b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 70b57cec5SDimitry Andric // 80b57cec5SDimitry Andric // This file contains a pass that performs optimization on SIMD instructions 90b57cec5SDimitry Andric // with high latency by splitting them into more efficient series of 100b57cec5SDimitry Andric // instructions. 110b57cec5SDimitry Andric // 120b57cec5SDimitry Andric // 1. Rewrite certain SIMD instructions with vector element due to their 130b57cec5SDimitry Andric // inefficiency on some targets. 140b57cec5SDimitry Andric // 150b57cec5SDimitry Andric // For example: 160b57cec5SDimitry Andric // fmla v0.4s, v1.4s, v2.s[1] 170b57cec5SDimitry Andric // 180b57cec5SDimitry Andric // Is rewritten into: 190b57cec5SDimitry Andric // dup v3.4s, v2.s[1] 200b57cec5SDimitry Andric // fmla v0.4s, v1.4s, v3.4s 210b57cec5SDimitry Andric // 220b57cec5SDimitry Andric // 2. Rewrite interleaved memory access instructions due to their 230b57cec5SDimitry Andric // inefficiency on some targets. 240b57cec5SDimitry Andric // 250b57cec5SDimitry Andric // For example: 260b57cec5SDimitry Andric // st2 {v0.4s, v1.4s}, addr 270b57cec5SDimitry Andric // 280b57cec5SDimitry Andric // Is rewritten into: 290b57cec5SDimitry Andric // zip1 v2.4s, v0.4s, v1.4s 300b57cec5SDimitry Andric // zip2 v3.4s, v0.4s, v1.4s 310b57cec5SDimitry Andric // stp q2, q3, addr 320b57cec5SDimitry Andric // 330b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 340b57cec5SDimitry Andric 350b57cec5SDimitry Andric #include "AArch64InstrInfo.h" 360b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h" 370b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h" 380b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 390b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h" 400b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h" 410b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 420b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstr.h" 430b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h" 440b57cec5SDimitry Andric #include "llvm/CodeGen/MachineOperand.h" 450b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h" 460b57cec5SDimitry Andric #include "llvm/CodeGen/TargetInstrInfo.h" 470b57cec5SDimitry Andric #include "llvm/CodeGen/TargetSchedule.h" 480b57cec5SDimitry Andric #include "llvm/CodeGen/TargetSubtargetInfo.h" 490b57cec5SDimitry Andric #include "llvm/MC/MCInstrDesc.h" 500b57cec5SDimitry Andric #include "llvm/MC/MCSchedule.h" 510b57cec5SDimitry Andric #include "llvm/Pass.h" 520b57cec5SDimitry Andric #include <unordered_map> 53*5f757f3fSDimitry Andric #include <map> 540b57cec5SDimitry Andric 550b57cec5SDimitry Andric using namespace llvm; 560b57cec5SDimitry Andric 570b57cec5SDimitry Andric #define DEBUG_TYPE "aarch64-simdinstr-opt" 580b57cec5SDimitry Andric 590b57cec5SDimitry Andric STATISTIC(NumModifiedInstr, 600b57cec5SDimitry Andric "Number of SIMD instructions modified"); 610b57cec5SDimitry Andric 620b57cec5SDimitry Andric #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ 630b57cec5SDimitry Andric "AArch64 SIMD instructions optimization pass" 640b57cec5SDimitry Andric 650b57cec5SDimitry Andric namespace { 660b57cec5SDimitry Andric 670b57cec5SDimitry Andric struct AArch64SIMDInstrOpt : public MachineFunctionPass { 680b57cec5SDimitry Andric static char ID; 690b57cec5SDimitry Andric 700b57cec5SDimitry Andric const TargetInstrInfo *TII; 710b57cec5SDimitry Andric MachineRegisterInfo *MRI; 720b57cec5SDimitry Andric TargetSchedModel SchedModel; 730b57cec5SDimitry Andric 740b57cec5SDimitry Andric // The two maps below are used to cache decisions instead of recomputing: 750b57cec5SDimitry Andric // This is used to cache instruction replacement decisions within function 760b57cec5SDimitry Andric // units and across function units. 770b57cec5SDimitry Andric std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable; 780b57cec5SDimitry Andric // This is used to cache the decision of whether to leave the interleaved 790b57cec5SDimitry Andric // store instructions replacement pass early or not for a particular target. 800b57cec5SDimitry Andric std::unordered_map<std::string, bool> InterlEarlyExit; 810b57cec5SDimitry Andric 820b57cec5SDimitry Andric typedef enum { 830b57cec5SDimitry Andric VectorElem, 840b57cec5SDimitry Andric Interleave 850b57cec5SDimitry Andric } Subpass; 860b57cec5SDimitry Andric 870b57cec5SDimitry Andric // Instruction represented by OrigOpc is replaced by instructions in ReplOpc. 880b57cec5SDimitry Andric struct InstReplInfo { 890b57cec5SDimitry Andric unsigned OrigOpc; 900b57cec5SDimitry Andric std::vector<unsigned> ReplOpc; 910b57cec5SDimitry Andric const TargetRegisterClass RC; 920b57cec5SDimitry Andric }; 930b57cec5SDimitry Andric 940b57cec5SDimitry Andric #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \ 950b57cec5SDimitry Andric {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC} 960b57cec5SDimitry Andric #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \ 970b57cec5SDimitry Andric OpcR7, OpcR8, OpcR9, RC) \ 980b57cec5SDimitry Andric {OpcOrg, \ 990b57cec5SDimitry Andric {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC} 1000b57cec5SDimitry Andric 1010b57cec5SDimitry Andric // The Instruction Replacement Table: 1020b57cec5SDimitry Andric std::vector<InstReplInfo> IRT = { 1030b57cec5SDimitry Andric // ST2 instructions 1040b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 1050b57cec5SDimitry Andric AArch64::STPQi, AArch64::FPR128RegClass), 1060b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 1070b57cec5SDimitry Andric AArch64::STPQi, AArch64::FPR128RegClass), 1080b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 1090b57cec5SDimitry Andric AArch64::STPDi, AArch64::FPR64RegClass), 1100b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 1110b57cec5SDimitry Andric AArch64::STPQi, AArch64::FPR128RegClass), 1120b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 1130b57cec5SDimitry Andric AArch64::STPDi, AArch64::FPR64RegClass), 1140b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 1150b57cec5SDimitry Andric AArch64::STPQi, AArch64::FPR128RegClass), 1160b57cec5SDimitry Andric RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 1170b57cec5SDimitry Andric AArch64::STPDi, AArch64::FPR64RegClass), 1180b57cec5SDimitry Andric // ST4 instructions 1190b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 1200b57cec5SDimitry Andric AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, 1210b57cec5SDimitry Andric AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 1220b57cec5SDimitry Andric AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 1230b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 1240b57cec5SDimitry Andric AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, 1250b57cec5SDimitry Andric AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 1260b57cec5SDimitry Andric AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 1270b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 1280b57cec5SDimitry Andric AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, 1290b57cec5SDimitry Andric AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 1300b57cec5SDimitry Andric AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 1310b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 1320b57cec5SDimitry Andric AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, 1330b57cec5SDimitry Andric AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 1340b57cec5SDimitry Andric AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 1350b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 1360b57cec5SDimitry Andric AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, 1370b57cec5SDimitry Andric AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 1380b57cec5SDimitry Andric AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 1390b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 1400b57cec5SDimitry Andric AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, 1410b57cec5SDimitry Andric AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 1420b57cec5SDimitry Andric AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 1430b57cec5SDimitry Andric RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 1440b57cec5SDimitry Andric AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, 1450b57cec5SDimitry Andric AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 1460b57cec5SDimitry Andric AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass) 1470b57cec5SDimitry Andric }; 1480b57cec5SDimitry Andric 1490b57cec5SDimitry Andric // A costly instruction is replaced in this work by N efficient instructions 1500b57cec5SDimitry Andric // The maximum of N is curently 10 and it is for ST4 case. 1510b57cec5SDimitry Andric static const unsigned MaxNumRepl = 10; 1520b57cec5SDimitry Andric 1530b57cec5SDimitry Andric AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { 1540b57cec5SDimitry Andric initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); 1550b57cec5SDimitry Andric } 1560b57cec5SDimitry Andric 1570b57cec5SDimitry Andric /// Based only on latency of instructions, determine if it is cost efficient 1580b57cec5SDimitry Andric /// to replace the instruction InstDesc by the instructions stored in the 1590b57cec5SDimitry Andric /// array InstDescRepl. 1600b57cec5SDimitry Andric /// Return true if replacement is expected to be faster. 1610b57cec5SDimitry Andric bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 1620b57cec5SDimitry Andric SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID); 1630b57cec5SDimitry Andric 1640b57cec5SDimitry Andric /// Determine if we need to exit the instruction replacement optimization 1650b57cec5SDimitry Andric /// passes early. This makes sure that no compile time is spent in this pass 1660b57cec5SDimitry Andric /// for targets with no need for any of these optimizations. 1670b57cec5SDimitry Andric /// Return true if early exit of the pass is recommended. 1680b57cec5SDimitry Andric bool shouldExitEarly(MachineFunction *MF, Subpass SP); 1690b57cec5SDimitry Andric 1700b57cec5SDimitry Andric /// Check whether an equivalent DUP instruction has already been 1710b57cec5SDimitry Andric /// created or not. 1720b57cec5SDimitry Andric /// Return true when the DUP instruction already exists. In this case, 1730b57cec5SDimitry Andric /// DestReg will point to the destination of the already created DUP. 1740b57cec5SDimitry Andric bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, 1750b57cec5SDimitry Andric unsigned LaneNumber, unsigned *DestReg) const; 1760b57cec5SDimitry Andric 1770b57cec5SDimitry Andric /// Certain SIMD instructions with vector element operand are not efficient. 1780b57cec5SDimitry Andric /// Rewrite them into SIMD instructions with vector operands. This rewrite 1790b57cec5SDimitry Andric /// is driven by the latency of the instructions. 1800b57cec5SDimitry Andric /// Return true if the SIMD instruction is modified. 1810b57cec5SDimitry Andric bool optimizeVectElement(MachineInstr &MI); 1820b57cec5SDimitry Andric 1830b57cec5SDimitry Andric /// Process The REG_SEQUENCE instruction, and extract the source 1840b57cec5SDimitry Andric /// operands of the ST2/4 instruction from it. 1850b57cec5SDimitry Andric /// Example of such instructions. 1860b57cec5SDimitry Andric /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 1870b57cec5SDimitry Andric /// Return true when the instruction is processed successfully. 1880b57cec5SDimitry Andric bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, 1890b57cec5SDimitry Andric unsigned* StRegKill, unsigned NumArg) const; 1900b57cec5SDimitry Andric 1910b57cec5SDimitry Andric /// Load/Store Interleaving instructions are not always beneficial. 1920b57cec5SDimitry Andric /// Replace them by ZIP instructionand classical load/store. 1930b57cec5SDimitry Andric /// Return true if the SIMD instruction is modified. 1940b57cec5SDimitry Andric bool optimizeLdStInterleave(MachineInstr &MI); 1950b57cec5SDimitry Andric 1960b57cec5SDimitry Andric /// Return the number of useful source registers for this 1970b57cec5SDimitry Andric /// instruction (2 for ST2 and 4 for ST4). 1980b57cec5SDimitry Andric unsigned determineSrcReg(MachineInstr &MI) const; 1990b57cec5SDimitry Andric 2000b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &Fn) override; 2010b57cec5SDimitry Andric 2020b57cec5SDimitry Andric StringRef getPassName() const override { 2030b57cec5SDimitry Andric return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; 2040b57cec5SDimitry Andric } 2050b57cec5SDimitry Andric }; 2060b57cec5SDimitry Andric 2070b57cec5SDimitry Andric char AArch64SIMDInstrOpt::ID = 0; 2080b57cec5SDimitry Andric 2090b57cec5SDimitry Andric } // end anonymous namespace 2100b57cec5SDimitry Andric 2110b57cec5SDimitry Andric INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt", 2120b57cec5SDimitry Andric AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) 2130b57cec5SDimitry Andric 2140b57cec5SDimitry Andric /// Based only on latency of instructions, determine if it is cost efficient 2150b57cec5SDimitry Andric /// to replace the instruction InstDesc by the instructions stored in the 2160b57cec5SDimitry Andric /// array InstDescRepl. 2170b57cec5SDimitry Andric /// Return true if replacement is expected to be faster. 2180b57cec5SDimitry Andric bool AArch64SIMDInstrOpt:: 2190b57cec5SDimitry Andric shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 2200b57cec5SDimitry Andric SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) { 2210b57cec5SDimitry Andric // Check if replacement decision is already available in the cached table. 2220b57cec5SDimitry Andric // if so, return it. 2235ffd83dbSDimitry Andric std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU()); 2240b57cec5SDimitry Andric auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); 225e8d8bef9SDimitry Andric auto It = SIMDInstrTable.find(InstID); 226e8d8bef9SDimitry Andric if (It != SIMDInstrTable.end()) 227e8d8bef9SDimitry Andric return It->second; 2280b57cec5SDimitry Andric 2290b57cec5SDimitry Andric unsigned SCIdx = InstDesc->getSchedClass(); 2300b57cec5SDimitry Andric const MCSchedClassDesc *SCDesc = 2310b57cec5SDimitry Andric SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); 2320b57cec5SDimitry Andric 2330b57cec5SDimitry Andric // If a target does not define resources for the instructions 2340b57cec5SDimitry Andric // of interest, then return false for no replacement. 2350b57cec5SDimitry Andric const MCSchedClassDesc *SCDescRepl; 2360b57cec5SDimitry Andric if (!SCDesc->isValid() || SCDesc->isVariant()) 2370b57cec5SDimitry Andric { 2380b57cec5SDimitry Andric SIMDInstrTable[InstID] = false; 2390b57cec5SDimitry Andric return false; 2400b57cec5SDimitry Andric } 241bdd1243dSDimitry Andric for (const auto *IDesc : InstDescRepl) 2420b57cec5SDimitry Andric { 2430b57cec5SDimitry Andric SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc( 2440b57cec5SDimitry Andric IDesc->getSchedClass()); 2450b57cec5SDimitry Andric if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) 2460b57cec5SDimitry Andric { 2470b57cec5SDimitry Andric SIMDInstrTable[InstID] = false; 2480b57cec5SDimitry Andric return false; 2490b57cec5SDimitry Andric } 2500b57cec5SDimitry Andric } 2510b57cec5SDimitry Andric 2520b57cec5SDimitry Andric // Replacement cost. 2530b57cec5SDimitry Andric unsigned ReplCost = 0; 254bdd1243dSDimitry Andric for (const auto *IDesc :InstDescRepl) 2550b57cec5SDimitry Andric ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode()); 2560b57cec5SDimitry Andric 2570b57cec5SDimitry Andric if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) 2580b57cec5SDimitry Andric { 2590b57cec5SDimitry Andric SIMDInstrTable[InstID] = true; 2600b57cec5SDimitry Andric return true; 2610b57cec5SDimitry Andric } 2620b57cec5SDimitry Andric else 2630b57cec5SDimitry Andric { 2640b57cec5SDimitry Andric SIMDInstrTable[InstID] = false; 2650b57cec5SDimitry Andric return false; 2660b57cec5SDimitry Andric } 2670b57cec5SDimitry Andric } 2680b57cec5SDimitry Andric 2690b57cec5SDimitry Andric /// Determine if we need to exit this pass for a kind of instruction replacement 2700b57cec5SDimitry Andric /// early. This makes sure that no compile time is spent in this pass for 2710b57cec5SDimitry Andric /// targets with no need for any of these optimizations beyond performing this 2720b57cec5SDimitry Andric /// check. 2730b57cec5SDimitry Andric /// Return true if early exit of this pass for a kind of instruction 2740b57cec5SDimitry Andric /// replacement is recommended for a target. 2750b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { 2760b57cec5SDimitry Andric const MCInstrDesc* OriginalMCID; 2770b57cec5SDimitry Andric SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 2780b57cec5SDimitry Andric 2790b57cec5SDimitry Andric switch (SP) { 2800b57cec5SDimitry Andric // For this optimization, check by comparing the latency of a representative 2810b57cec5SDimitry Andric // instruction to that of the replacement instructions. 2820b57cec5SDimitry Andric // TODO: check for all concerned instructions. 2830b57cec5SDimitry Andric case VectorElem: 2840b57cec5SDimitry Andric OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed); 2850b57cec5SDimitry Andric ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane)); 2860b57cec5SDimitry Andric ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32)); 2870b57cec5SDimitry Andric if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) 2880b57cec5SDimitry Andric return false; 2890b57cec5SDimitry Andric break; 2900b57cec5SDimitry Andric 2910b57cec5SDimitry Andric // For this optimization, check for all concerned instructions. 2920b57cec5SDimitry Andric case Interleave: 2935ffd83dbSDimitry Andric std::string Subtarget = 2945ffd83dbSDimitry Andric std::string(SchedModel.getSubtargetInfo()->getCPU()); 295e8d8bef9SDimitry Andric auto It = InterlEarlyExit.find(Subtarget); 296e8d8bef9SDimitry Andric if (It != InterlEarlyExit.end()) 297e8d8bef9SDimitry Andric return It->second; 2980b57cec5SDimitry Andric 2990b57cec5SDimitry Andric for (auto &I : IRT) { 3000b57cec5SDimitry Andric OriginalMCID = &TII->get(I.OrigOpc); 3010b57cec5SDimitry Andric for (auto &Repl : I.ReplOpc) 3020b57cec5SDimitry Andric ReplInstrMCID.push_back(&TII->get(Repl)); 3030b57cec5SDimitry Andric if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) { 3040b57cec5SDimitry Andric InterlEarlyExit[Subtarget] = false; 3050b57cec5SDimitry Andric return false; 3060b57cec5SDimitry Andric } 3070b57cec5SDimitry Andric ReplInstrMCID.clear(); 3080b57cec5SDimitry Andric } 3090b57cec5SDimitry Andric InterlEarlyExit[Subtarget] = true; 3100b57cec5SDimitry Andric break; 3110b57cec5SDimitry Andric } 3120b57cec5SDimitry Andric 3130b57cec5SDimitry Andric return true; 3140b57cec5SDimitry Andric } 3150b57cec5SDimitry Andric 3160b57cec5SDimitry Andric /// Check whether an equivalent DUP instruction has already been 3170b57cec5SDimitry Andric /// created or not. 3180b57cec5SDimitry Andric /// Return true when the DUP instruction already exists. In this case, 3190b57cec5SDimitry Andric /// DestReg will point to the destination of the already created DUP. 3200b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, 3210b57cec5SDimitry Andric unsigned SrcReg, unsigned LaneNumber, 3220b57cec5SDimitry Andric unsigned *DestReg) const { 3230b57cec5SDimitry Andric for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); 3240b57cec5SDimitry Andric MII != MIE;) { 3250b57cec5SDimitry Andric MII--; 3260b57cec5SDimitry Andric MachineInstr *CurrentMI = &*MII; 3270b57cec5SDimitry Andric 3280b57cec5SDimitry Andric if (CurrentMI->getOpcode() == DupOpcode && 3290b57cec5SDimitry Andric CurrentMI->getNumOperands() == 3 && 3300b57cec5SDimitry Andric CurrentMI->getOperand(1).getReg() == SrcReg && 3310b57cec5SDimitry Andric CurrentMI->getOperand(2).getImm() == LaneNumber) { 3320b57cec5SDimitry Andric *DestReg = CurrentMI->getOperand(0).getReg(); 3330b57cec5SDimitry Andric return true; 3340b57cec5SDimitry Andric } 3350b57cec5SDimitry Andric } 3360b57cec5SDimitry Andric 3370b57cec5SDimitry Andric return false; 3380b57cec5SDimitry Andric } 3390b57cec5SDimitry Andric 3400b57cec5SDimitry Andric /// Certain SIMD instructions with vector element operand are not efficient. 3410b57cec5SDimitry Andric /// Rewrite them into SIMD instructions with vector operands. This rewrite 3420b57cec5SDimitry Andric /// is driven by the latency of the instructions. 3430b57cec5SDimitry Andric /// The instruction of concerns are for the time being FMLA, FMLS, FMUL, 3440b57cec5SDimitry Andric /// and FMULX and hence they are hardcoded. 3450b57cec5SDimitry Andric /// 3460b57cec5SDimitry Andric /// For example: 3470b57cec5SDimitry Andric /// fmla v0.4s, v1.4s, v2.s[1] 3480b57cec5SDimitry Andric /// 3490b57cec5SDimitry Andric /// Is rewritten into 3500b57cec5SDimitry Andric /// dup v3.4s, v2.s[1] // DUP not necessary if redundant 3510b57cec5SDimitry Andric /// fmla v0.4s, v1.4s, v3.4s 3520b57cec5SDimitry Andric /// 3530b57cec5SDimitry Andric /// Return true if the SIMD instruction is modified. 3540b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { 3550b57cec5SDimitry Andric const MCInstrDesc *MulMCID, *DupMCID; 3560b57cec5SDimitry Andric const TargetRegisterClass *RC = &AArch64::FPR128RegClass; 3570b57cec5SDimitry Andric 3580b57cec5SDimitry Andric switch (MI.getOpcode()) { 3590b57cec5SDimitry Andric default: 3600b57cec5SDimitry Andric return false; 3610b57cec5SDimitry Andric 3620b57cec5SDimitry Andric // 4X32 instructions 3630b57cec5SDimitry Andric case AArch64::FMLAv4i32_indexed: 3640b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv4i32lane); 3650b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMLAv4f32); 3660b57cec5SDimitry Andric break; 3670b57cec5SDimitry Andric case AArch64::FMLSv4i32_indexed: 3680b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv4i32lane); 3690b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMLSv4f32); 3700b57cec5SDimitry Andric break; 3710b57cec5SDimitry Andric case AArch64::FMULXv4i32_indexed: 3720b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv4i32lane); 3730b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMULXv4f32); 3740b57cec5SDimitry Andric break; 3750b57cec5SDimitry Andric case AArch64::FMULv4i32_indexed: 3760b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv4i32lane); 3770b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMULv4f32); 3780b57cec5SDimitry Andric break; 3790b57cec5SDimitry Andric 3800b57cec5SDimitry Andric // 2X64 instructions 3810b57cec5SDimitry Andric case AArch64::FMLAv2i64_indexed: 3820b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i64lane); 3830b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMLAv2f64); 3840b57cec5SDimitry Andric break; 3850b57cec5SDimitry Andric case AArch64::FMLSv2i64_indexed: 3860b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i64lane); 3870b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMLSv2f64); 3880b57cec5SDimitry Andric break; 3890b57cec5SDimitry Andric case AArch64::FMULXv2i64_indexed: 3900b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i64lane); 3910b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMULXv2f64); 3920b57cec5SDimitry Andric break; 3930b57cec5SDimitry Andric case AArch64::FMULv2i64_indexed: 3940b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i64lane); 3950b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMULv2f64); 3960b57cec5SDimitry Andric break; 3970b57cec5SDimitry Andric 3980b57cec5SDimitry Andric // 2X32 instructions 3990b57cec5SDimitry Andric case AArch64::FMLAv2i32_indexed: 4000b57cec5SDimitry Andric RC = &AArch64::FPR64RegClass; 4010b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i32lane); 4020b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMLAv2f32); 4030b57cec5SDimitry Andric break; 4040b57cec5SDimitry Andric case AArch64::FMLSv2i32_indexed: 4050b57cec5SDimitry Andric RC = &AArch64::FPR64RegClass; 4060b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i32lane); 4070b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMLSv2f32); 4080b57cec5SDimitry Andric break; 4090b57cec5SDimitry Andric case AArch64::FMULXv2i32_indexed: 4100b57cec5SDimitry Andric RC = &AArch64::FPR64RegClass; 4110b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i32lane); 4120b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMULXv2f32); 4130b57cec5SDimitry Andric break; 4140b57cec5SDimitry Andric case AArch64::FMULv2i32_indexed: 4150b57cec5SDimitry Andric RC = &AArch64::FPR64RegClass; 4160b57cec5SDimitry Andric DupMCID = &TII->get(AArch64::DUPv2i32lane); 4170b57cec5SDimitry Andric MulMCID = &TII->get(AArch64::FMULv2f32); 4180b57cec5SDimitry Andric break; 4190b57cec5SDimitry Andric } 4200b57cec5SDimitry Andric 4210b57cec5SDimitry Andric SmallVector<const MCInstrDesc*, 2> ReplInstrMCID; 4220b57cec5SDimitry Andric ReplInstrMCID.push_back(DupMCID); 4230b57cec5SDimitry Andric ReplInstrMCID.push_back(MulMCID); 4240b57cec5SDimitry Andric if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 4250b57cec5SDimitry Andric ReplInstrMCID)) 4260b57cec5SDimitry Andric return false; 4270b57cec5SDimitry Andric 4280b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 4290b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 4300b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4310b57cec5SDimitry Andric 4320b57cec5SDimitry Andric // Get the operands of the current SIMD arithmetic instruction. 4338bcb0991SDimitry Andric Register MulDest = MI.getOperand(0).getReg(); 4348bcb0991SDimitry Andric Register SrcReg0 = MI.getOperand(1).getReg(); 4350b57cec5SDimitry Andric unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); 4368bcb0991SDimitry Andric Register SrcReg1 = MI.getOperand(2).getReg(); 4370b57cec5SDimitry Andric unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); 4380b57cec5SDimitry Andric unsigned DupDest; 4390b57cec5SDimitry Andric 4400b57cec5SDimitry Andric // Instructions of interest have either 4 or 5 operands. 4410b57cec5SDimitry Andric if (MI.getNumOperands() == 5) { 4428bcb0991SDimitry Andric Register SrcReg2 = MI.getOperand(3).getReg(); 4430b57cec5SDimitry Andric unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); 4440b57cec5SDimitry Andric unsigned LaneNumber = MI.getOperand(4).getImm(); 4450b57cec5SDimitry Andric // Create a new DUP instruction. Note that if an equivalent DUP instruction 4460b57cec5SDimitry Andric // has already been created before, then use that one instead of creating 4470b57cec5SDimitry Andric // a new one. 4480b57cec5SDimitry Andric if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { 4490b57cec5SDimitry Andric DupDest = MRI.createVirtualRegister(RC); 4500b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *DupMCID, DupDest) 4510b57cec5SDimitry Andric .addReg(SrcReg2, Src2IsKill) 4520b57cec5SDimitry Andric .addImm(LaneNumber); 4530b57cec5SDimitry Andric } 4540b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *MulMCID, MulDest) 4550b57cec5SDimitry Andric .addReg(SrcReg0, Src0IsKill) 4560b57cec5SDimitry Andric .addReg(SrcReg1, Src1IsKill) 4570b57cec5SDimitry Andric .addReg(DupDest, Src2IsKill); 4580b57cec5SDimitry Andric } else if (MI.getNumOperands() == 4) { 4590b57cec5SDimitry Andric unsigned LaneNumber = MI.getOperand(3).getImm(); 4600b57cec5SDimitry Andric if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { 4610b57cec5SDimitry Andric DupDest = MRI.createVirtualRegister(RC); 4620b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *DupMCID, DupDest) 4630b57cec5SDimitry Andric .addReg(SrcReg1, Src1IsKill) 4640b57cec5SDimitry Andric .addImm(LaneNumber); 4650b57cec5SDimitry Andric } 4660b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *MulMCID, MulDest) 4670b57cec5SDimitry Andric .addReg(SrcReg0, Src0IsKill) 4680b57cec5SDimitry Andric .addReg(DupDest, Src1IsKill); 4690b57cec5SDimitry Andric } else { 4700b57cec5SDimitry Andric return false; 4710b57cec5SDimitry Andric } 4720b57cec5SDimitry Andric 4730b57cec5SDimitry Andric ++NumModifiedInstr; 4740b57cec5SDimitry Andric return true; 4750b57cec5SDimitry Andric } 4760b57cec5SDimitry Andric 4770b57cec5SDimitry Andric /// Load/Store Interleaving instructions are not always beneficial. 4780b57cec5SDimitry Andric /// Replace them by ZIP instructions and classical load/store. 4790b57cec5SDimitry Andric /// 4800b57cec5SDimitry Andric /// For example: 4810b57cec5SDimitry Andric /// st2 {v0.4s, v1.4s}, addr 4820b57cec5SDimitry Andric /// 4830b57cec5SDimitry Andric /// Is rewritten into: 4840b57cec5SDimitry Andric /// zip1 v2.4s, v0.4s, v1.4s 4850b57cec5SDimitry Andric /// zip2 v3.4s, v0.4s, v1.4s 4860b57cec5SDimitry Andric /// stp q2, q3, addr 4870b57cec5SDimitry Andric // 4880b57cec5SDimitry Andric /// For example: 4890b57cec5SDimitry Andric /// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr 4900b57cec5SDimitry Andric /// 4910b57cec5SDimitry Andric /// Is rewritten into: 4920b57cec5SDimitry Andric /// zip1 v4.4s, v0.4s, v2.4s 4930b57cec5SDimitry Andric /// zip2 v5.4s, v0.4s, v2.4s 4940b57cec5SDimitry Andric /// zip1 v6.4s, v1.4s, v3.4s 4950b57cec5SDimitry Andric /// zip2 v7.4s, v1.4s, v3.4s 4960b57cec5SDimitry Andric /// zip1 v8.4s, v4.4s, v6.4s 4970b57cec5SDimitry Andric /// zip2 v9.4s, v4.4s, v6.4s 4980b57cec5SDimitry Andric /// zip1 v10.4s, v5.4s, v7.4s 4990b57cec5SDimitry Andric /// zip2 v11.4s, v5.4s, v7.4s 5000b57cec5SDimitry Andric /// stp q8, q9, addr 5010b57cec5SDimitry Andric /// stp q10, q11, addr+32 5020b57cec5SDimitry Andric /// 5030b57cec5SDimitry Andric /// Currently only instructions related to ST2 and ST4 are considered. 5040b57cec5SDimitry Andric /// Other may be added later. 5050b57cec5SDimitry Andric /// Return true if the SIMD instruction is modified. 5060b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { 5070b57cec5SDimitry Andric 5080b57cec5SDimitry Andric unsigned SeqReg, AddrReg; 5090b57cec5SDimitry Andric unsigned StReg[4], StRegKill[4]; 5100b57cec5SDimitry Andric MachineInstr *DefiningMI; 5110b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 5120b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 5130b57cec5SDimitry Andric SmallVector<unsigned, MaxNumRepl> ZipDest; 5140b57cec5SDimitry Andric SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 5150b57cec5SDimitry Andric 5160b57cec5SDimitry Andric // If current instruction matches any of the rewriting rules, then 5170b57cec5SDimitry Andric // gather information about parameters of the new instructions. 5180b57cec5SDimitry Andric bool Match = false; 5190b57cec5SDimitry Andric for (auto &I : IRT) { 5200b57cec5SDimitry Andric if (MI.getOpcode() == I.OrigOpc) { 5210b57cec5SDimitry Andric SeqReg = MI.getOperand(0).getReg(); 5220b57cec5SDimitry Andric AddrReg = MI.getOperand(1).getReg(); 5230b57cec5SDimitry Andric DefiningMI = MRI->getUniqueVRegDef(SeqReg); 5240b57cec5SDimitry Andric unsigned NumReg = determineSrcReg(MI); 5250b57cec5SDimitry Andric if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg)) 5260b57cec5SDimitry Andric return false; 5270b57cec5SDimitry Andric 5280b57cec5SDimitry Andric for (auto &Repl : I.ReplOpc) { 5290b57cec5SDimitry Andric ReplInstrMCID.push_back(&TII->get(Repl)); 5300b57cec5SDimitry Andric // Generate destination registers but only for non-store instruction. 5310b57cec5SDimitry Andric if (Repl != AArch64::STPQi && Repl != AArch64::STPDi) 5320b57cec5SDimitry Andric ZipDest.push_back(MRI->createVirtualRegister(&I.RC)); 5330b57cec5SDimitry Andric } 5340b57cec5SDimitry Andric Match = true; 5350b57cec5SDimitry Andric break; 5360b57cec5SDimitry Andric } 5370b57cec5SDimitry Andric } 5380b57cec5SDimitry Andric 5390b57cec5SDimitry Andric if (!Match) 5400b57cec5SDimitry Andric return false; 5410b57cec5SDimitry Andric 5420b57cec5SDimitry Andric // Determine if it is profitable to replace MI by the series of instructions 5430b57cec5SDimitry Andric // represented in ReplInstrMCID. 5440b57cec5SDimitry Andric if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 5450b57cec5SDimitry Andric ReplInstrMCID)) 5460b57cec5SDimitry Andric return false; 5470b57cec5SDimitry Andric 5480b57cec5SDimitry Andric // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at 5490b57cec5SDimitry Andric // this point, the code generation is hardcoded and does not rely on the IRT 5500b57cec5SDimitry Andric // table used above given that code generation for ST2 replacement is somewhat 5510b57cec5SDimitry Andric // different than for ST4 replacement. We could have added more info into the 5520b57cec5SDimitry Andric // table related to how we build new instructions but we may be adding more 5530b57cec5SDimitry Andric // complexity with that). 5540b57cec5SDimitry Andric switch (MI.getOpcode()) { 5550b57cec5SDimitry Andric default: 5560b57cec5SDimitry Andric return false; 5570b57cec5SDimitry Andric 5580b57cec5SDimitry Andric case AArch64::ST2Twov16b: 5590b57cec5SDimitry Andric case AArch64::ST2Twov8b: 5600b57cec5SDimitry Andric case AArch64::ST2Twov8h: 5610b57cec5SDimitry Andric case AArch64::ST2Twov4h: 5620b57cec5SDimitry Andric case AArch64::ST2Twov4s: 5630b57cec5SDimitry Andric case AArch64::ST2Twov2s: 5640b57cec5SDimitry Andric case AArch64::ST2Twov2d: 5650b57cec5SDimitry Andric // ZIP instructions 5660b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 5670b57cec5SDimitry Andric .addReg(StReg[0]) 5680b57cec5SDimitry Andric .addReg(StReg[1]); 5690b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 5700b57cec5SDimitry Andric .addReg(StReg[0], StRegKill[0]) 5710b57cec5SDimitry Andric .addReg(StReg[1], StRegKill[1]); 5720b57cec5SDimitry Andric // STP instructions 5730b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[2]) 5740b57cec5SDimitry Andric .addReg(ZipDest[0]) 5750b57cec5SDimitry Andric .addReg(ZipDest[1]) 5760b57cec5SDimitry Andric .addReg(AddrReg) 5770b57cec5SDimitry Andric .addImm(0); 5780b57cec5SDimitry Andric break; 5790b57cec5SDimitry Andric 5800b57cec5SDimitry Andric case AArch64::ST4Fourv16b: 5810b57cec5SDimitry Andric case AArch64::ST4Fourv8b: 5820b57cec5SDimitry Andric case AArch64::ST4Fourv8h: 5830b57cec5SDimitry Andric case AArch64::ST4Fourv4h: 5840b57cec5SDimitry Andric case AArch64::ST4Fourv4s: 5850b57cec5SDimitry Andric case AArch64::ST4Fourv2s: 5860b57cec5SDimitry Andric case AArch64::ST4Fourv2d: 5870b57cec5SDimitry Andric // ZIP instructions 5880b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 5890b57cec5SDimitry Andric .addReg(StReg[0]) 5900b57cec5SDimitry Andric .addReg(StReg[2]); 5910b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 5920b57cec5SDimitry Andric .addReg(StReg[0], StRegKill[0]) 5930b57cec5SDimitry Andric .addReg(StReg[2], StRegKill[2]); 5940b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2]) 5950b57cec5SDimitry Andric .addReg(StReg[1]) 5960b57cec5SDimitry Andric .addReg(StReg[3]); 5970b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3]) 5980b57cec5SDimitry Andric .addReg(StReg[1], StRegKill[1]) 5990b57cec5SDimitry Andric .addReg(StReg[3], StRegKill[3]); 6000b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4]) 6010b57cec5SDimitry Andric .addReg(ZipDest[0]) 6020b57cec5SDimitry Andric .addReg(ZipDest[2]); 6030b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5]) 6040b57cec5SDimitry Andric .addReg(ZipDest[0]) 6050b57cec5SDimitry Andric .addReg(ZipDest[2]); 6060b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6]) 6070b57cec5SDimitry Andric .addReg(ZipDest[1]) 6080b57cec5SDimitry Andric .addReg(ZipDest[3]); 6090b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7]) 6100b57cec5SDimitry Andric .addReg(ZipDest[1]) 6110b57cec5SDimitry Andric .addReg(ZipDest[3]); 6120b57cec5SDimitry Andric // stp instructions 6130b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[8]) 6140b57cec5SDimitry Andric .addReg(ZipDest[4]) 6150b57cec5SDimitry Andric .addReg(ZipDest[5]) 6160b57cec5SDimitry Andric .addReg(AddrReg) 6170b57cec5SDimitry Andric .addImm(0); 6180b57cec5SDimitry Andric BuildMI(MBB, MI, DL, *ReplInstrMCID[9]) 6190b57cec5SDimitry Andric .addReg(ZipDest[6]) 6200b57cec5SDimitry Andric .addReg(ZipDest[7]) 6210b57cec5SDimitry Andric .addReg(AddrReg) 6220b57cec5SDimitry Andric .addImm(2); 6230b57cec5SDimitry Andric break; 6240b57cec5SDimitry Andric } 6250b57cec5SDimitry Andric 6260b57cec5SDimitry Andric ++NumModifiedInstr; 6270b57cec5SDimitry Andric return true; 6280b57cec5SDimitry Andric } 6290b57cec5SDimitry Andric 6300b57cec5SDimitry Andric /// Process The REG_SEQUENCE instruction, and extract the source 6310b57cec5SDimitry Andric /// operands of the ST2/4 instruction from it. 6320b57cec5SDimitry Andric /// Example of such instruction. 6330b57cec5SDimitry Andric /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 6340b57cec5SDimitry Andric /// Return true when the instruction is processed successfully. 6350b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, 6360b57cec5SDimitry Andric unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { 63704eeddc0SDimitry Andric assert(DefiningMI != nullptr); 6380b57cec5SDimitry Andric if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) 6390b57cec5SDimitry Andric return false; 6400b57cec5SDimitry Andric 6410b57cec5SDimitry Andric for (unsigned i=0; i<NumArg; i++) { 6420b57cec5SDimitry Andric StReg[i] = DefiningMI->getOperand(2*i+1).getReg(); 6430b57cec5SDimitry Andric StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill()); 6440b57cec5SDimitry Andric 645349cc55cSDimitry Andric // Validation check for the other arguments. 6460b57cec5SDimitry Andric if (DefiningMI->getOperand(2*i+2).isImm()) { 6470b57cec5SDimitry Andric switch (DefiningMI->getOperand(2*i+2).getImm()) { 6480b57cec5SDimitry Andric default: 6490b57cec5SDimitry Andric return false; 6500b57cec5SDimitry Andric 6510b57cec5SDimitry Andric case AArch64::dsub0: 6520b57cec5SDimitry Andric case AArch64::dsub1: 6530b57cec5SDimitry Andric case AArch64::dsub2: 6540b57cec5SDimitry Andric case AArch64::dsub3: 6550b57cec5SDimitry Andric case AArch64::qsub0: 6560b57cec5SDimitry Andric case AArch64::qsub1: 6570b57cec5SDimitry Andric case AArch64::qsub2: 6580b57cec5SDimitry Andric case AArch64::qsub3: 6590b57cec5SDimitry Andric break; 6600b57cec5SDimitry Andric } 6610b57cec5SDimitry Andric } 6620b57cec5SDimitry Andric else 6630b57cec5SDimitry Andric return false; 6640b57cec5SDimitry Andric } 6650b57cec5SDimitry Andric return true; 6660b57cec5SDimitry Andric } 6670b57cec5SDimitry Andric 6680b57cec5SDimitry Andric /// Return the number of useful source registers for this instruction 6690b57cec5SDimitry Andric /// (2 for ST2 and 4 for ST4). 6700b57cec5SDimitry Andric unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const { 6710b57cec5SDimitry Andric switch (MI.getOpcode()) { 6720b57cec5SDimitry Andric default: 6730b57cec5SDimitry Andric llvm_unreachable("Unsupported instruction for this pass"); 6740b57cec5SDimitry Andric 6750b57cec5SDimitry Andric case AArch64::ST2Twov16b: 6760b57cec5SDimitry Andric case AArch64::ST2Twov8b: 6770b57cec5SDimitry Andric case AArch64::ST2Twov8h: 6780b57cec5SDimitry Andric case AArch64::ST2Twov4h: 6790b57cec5SDimitry Andric case AArch64::ST2Twov4s: 6800b57cec5SDimitry Andric case AArch64::ST2Twov2s: 6810b57cec5SDimitry Andric case AArch64::ST2Twov2d: 6820b57cec5SDimitry Andric return 2; 6830b57cec5SDimitry Andric 6840b57cec5SDimitry Andric case AArch64::ST4Fourv16b: 6850b57cec5SDimitry Andric case AArch64::ST4Fourv8b: 6860b57cec5SDimitry Andric case AArch64::ST4Fourv8h: 6870b57cec5SDimitry Andric case AArch64::ST4Fourv4h: 6880b57cec5SDimitry Andric case AArch64::ST4Fourv4s: 6890b57cec5SDimitry Andric case AArch64::ST4Fourv2s: 6900b57cec5SDimitry Andric case AArch64::ST4Fourv2d: 6910b57cec5SDimitry Andric return 4; 6920b57cec5SDimitry Andric } 6930b57cec5SDimitry Andric } 6940b57cec5SDimitry Andric 6950b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { 6960b57cec5SDimitry Andric if (skipFunction(MF.getFunction())) 6970b57cec5SDimitry Andric return false; 6980b57cec5SDimitry Andric 6990b57cec5SDimitry Andric TII = MF.getSubtarget().getInstrInfo(); 7000b57cec5SDimitry Andric MRI = &MF.getRegInfo(); 7010b57cec5SDimitry Andric const TargetSubtargetInfo &ST = MF.getSubtarget(); 7020b57cec5SDimitry Andric const AArch64InstrInfo *AAII = 7030b57cec5SDimitry Andric static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); 7040b57cec5SDimitry Andric if (!AAII) 7050b57cec5SDimitry Andric return false; 7060b57cec5SDimitry Andric SchedModel.init(&ST); 7070b57cec5SDimitry Andric if (!SchedModel.hasInstrSchedModel()) 7080b57cec5SDimitry Andric return false; 7090b57cec5SDimitry Andric 7100b57cec5SDimitry Andric bool Changed = false; 7110b57cec5SDimitry Andric for (auto OptimizationKind : {VectorElem, Interleave}) { 7120b57cec5SDimitry Andric if (!shouldExitEarly(&MF, OptimizationKind)) { 7130b57cec5SDimitry Andric SmallVector<MachineInstr *, 8> RemoveMIs; 7140b57cec5SDimitry Andric for (MachineBasicBlock &MBB : MF) { 715349cc55cSDimitry Andric for (MachineInstr &MI : MBB) { 7160b57cec5SDimitry Andric bool InstRewrite; 7170b57cec5SDimitry Andric if (OptimizationKind == VectorElem) 7180b57cec5SDimitry Andric InstRewrite = optimizeVectElement(MI) ; 7190b57cec5SDimitry Andric else 7200b57cec5SDimitry Andric InstRewrite = optimizeLdStInterleave(MI); 7210b57cec5SDimitry Andric if (InstRewrite) { 7220b57cec5SDimitry Andric // Add MI to the list of instructions to be removed given that it 7230b57cec5SDimitry Andric // has been replaced. 7240b57cec5SDimitry Andric RemoveMIs.push_back(&MI); 7250b57cec5SDimitry Andric Changed = true; 7260b57cec5SDimitry Andric } 7270b57cec5SDimitry Andric } 7280b57cec5SDimitry Andric } 7290b57cec5SDimitry Andric for (MachineInstr *MI : RemoveMIs) 7300b57cec5SDimitry Andric MI->eraseFromParent(); 7310b57cec5SDimitry Andric } 7320b57cec5SDimitry Andric } 7330b57cec5SDimitry Andric 7340b57cec5SDimitry Andric return Changed; 7350b57cec5SDimitry Andric } 7360b57cec5SDimitry Andric 7370b57cec5SDimitry Andric /// Returns an instance of the high cost ASIMD instruction replacement 7380b57cec5SDimitry Andric /// optimization pass. 7390b57cec5SDimitry Andric FunctionPass *llvm::createAArch64SIMDInstrOptPass() { 7400b57cec5SDimitry Andric return new AArch64SIMDInstrOpt(); 7410b57cec5SDimitry Andric } 742