1 // 2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 3 // See https://llvm.org/LICENSE.txt for license information. 4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 // 6 //===----------------------------------------------------------------------===// 7 // 8 // This file contains a pass that performs optimization on SIMD instructions 9 // with high latency by splitting them into more efficient series of 10 // instructions. 11 // 12 // 1. Rewrite certain SIMD instructions with vector element due to their 13 // inefficiency on some targets. 14 // 15 // For example: 16 // fmla v0.4s, v1.4s, v2.s[1] 17 // 18 // Is rewritten into: 19 // dup v3.4s, v2.s[1] 20 // fmla v0.4s, v1.4s, v3.4s 21 // 22 // 2. Rewrite interleaved memory access instructions due to their 23 // inefficiency on some targets. 24 // 25 // For example: 26 // st2 {v0.4s, v1.4s}, addr 27 // 28 // Is rewritten into: 29 // zip1 v2.4s, v0.4s, v1.4s 30 // zip2 v3.4s, v0.4s, v1.4s 31 // stp q2, q3, addr 32 // 33 //===----------------------------------------------------------------------===// 34 35 #include "AArch64InstrInfo.h" 36 #include "llvm/ADT/SmallVector.h" 37 #include "llvm/ADT/Statistic.h" 38 #include "llvm/ADT/StringRef.h" 39 #include "llvm/CodeGen/MachineBasicBlock.h" 40 #include "llvm/CodeGen/MachineFunction.h" 41 #include "llvm/CodeGen/MachineFunctionPass.h" 42 #include "llvm/CodeGen/MachineInstr.h" 43 #include "llvm/CodeGen/MachineInstrBuilder.h" 44 #include "llvm/CodeGen/MachineOperand.h" 45 #include "llvm/CodeGen/MachineRegisterInfo.h" 46 #include "llvm/CodeGen/TargetInstrInfo.h" 47 #include "llvm/CodeGen/TargetSchedule.h" 48 #include "llvm/CodeGen/TargetSubtargetInfo.h" 49 #include "llvm/MC/MCInstrDesc.h" 50 #include "llvm/MC/MCSchedule.h" 51 #include "llvm/Pass.h" 52 #include <unordered_map> 53 54 using namespace llvm; 55 56 #define DEBUG_TYPE "aarch64-simdinstr-opt" 57 58 STATISTIC(NumModifiedInstr, 59 "Number of SIMD instructions modified"); 60 61 #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ 62 "AArch64 SIMD instructions optimization pass" 63 64 namespace { 65 66 struct AArch64SIMDInstrOpt : public MachineFunctionPass { 67 static char ID; 68 69 const TargetInstrInfo *TII; 70 MachineRegisterInfo *MRI; 71 TargetSchedModel SchedModel; 72 73 // The two maps below are used to cache decisions instead of recomputing: 74 // This is used to cache instruction replacement decisions within function 75 // units and across function units. 76 std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable; 77 // This is used to cache the decision of whether to leave the interleaved 78 // store instructions replacement pass early or not for a particular target. 79 std::unordered_map<std::string, bool> InterlEarlyExit; 80 81 typedef enum { 82 VectorElem, 83 Interleave 84 } Subpass; 85 86 // Instruction represented by OrigOpc is replaced by instructions in ReplOpc. 87 struct InstReplInfo { 88 unsigned OrigOpc; 89 std::vector<unsigned> ReplOpc; 90 const TargetRegisterClass RC; 91 }; 92 93 #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \ 94 {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC} 95 #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \ 96 OpcR7, OpcR8, OpcR9, RC) \ 97 {OpcOrg, \ 98 {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC} 99 100 // The Instruction Replacement Table: 101 std::vector<InstReplInfo> IRT = { 102 // ST2 instructions 103 RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 104 AArch64::STPQi, AArch64::FPR128RegClass), 105 RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 106 AArch64::STPQi, AArch64::FPR128RegClass), 107 RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 108 AArch64::STPDi, AArch64::FPR64RegClass), 109 RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 110 AArch64::STPQi, AArch64::FPR128RegClass), 111 RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 112 AArch64::STPDi, AArch64::FPR64RegClass), 113 RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 114 AArch64::STPQi, AArch64::FPR128RegClass), 115 RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 116 AArch64::STPDi, AArch64::FPR64RegClass), 117 // ST4 instructions 118 RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 119 AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, 120 AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 121 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 122 RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 123 AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, 124 AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 125 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 126 RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 127 AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, 128 AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 129 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 130 RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 131 AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, 132 AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 133 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 134 RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 135 AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, 136 AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 137 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 138 RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 139 AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, 140 AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 141 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 142 RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 143 AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, 144 AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 145 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass) 146 }; 147 148 // A costly instruction is replaced in this work by N efficient instructions 149 // The maximum of N is curently 10 and it is for ST4 case. 150 static const unsigned MaxNumRepl = 10; 151 152 AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { 153 initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); 154 } 155 156 /// Based only on latency of instructions, determine if it is cost efficient 157 /// to replace the instruction InstDesc by the instructions stored in the 158 /// array InstDescRepl. 159 /// Return true if replacement is expected to be faster. 160 bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 161 SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID); 162 163 /// Determine if we need to exit the instruction replacement optimization 164 /// passes early. This makes sure that no compile time is spent in this pass 165 /// for targets with no need for any of these optimizations. 166 /// Return true if early exit of the pass is recommended. 167 bool shouldExitEarly(MachineFunction *MF, Subpass SP); 168 169 /// Check whether an equivalent DUP instruction has already been 170 /// created or not. 171 /// Return true when the DUP instruction already exists. In this case, 172 /// DestReg will point to the destination of the already created DUP. 173 bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, 174 unsigned LaneNumber, unsigned *DestReg) const; 175 176 /// Certain SIMD instructions with vector element operand are not efficient. 177 /// Rewrite them into SIMD instructions with vector operands. This rewrite 178 /// is driven by the latency of the instructions. 179 /// Return true if the SIMD instruction is modified. 180 bool optimizeVectElement(MachineInstr &MI); 181 182 /// Process The REG_SEQUENCE instruction, and extract the source 183 /// operands of the ST2/4 instruction from it. 184 /// Example of such instructions. 185 /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 186 /// Return true when the instruction is processed successfully. 187 bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, 188 unsigned* StRegKill, unsigned NumArg) const; 189 190 /// Load/Store Interleaving instructions are not always beneficial. 191 /// Replace them by ZIP instructionand classical load/store. 192 /// Return true if the SIMD instruction is modified. 193 bool optimizeLdStInterleave(MachineInstr &MI); 194 195 /// Return the number of useful source registers for this 196 /// instruction (2 for ST2 and 4 for ST4). 197 unsigned determineSrcReg(MachineInstr &MI) const; 198 199 bool runOnMachineFunction(MachineFunction &Fn) override; 200 201 StringRef getPassName() const override { 202 return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; 203 } 204 }; 205 206 char AArch64SIMDInstrOpt::ID = 0; 207 208 } // end anonymous namespace 209 210 INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt", 211 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) 212 213 /// Based only on latency of instructions, determine if it is cost efficient 214 /// to replace the instruction InstDesc by the instructions stored in the 215 /// array InstDescRepl. 216 /// Return true if replacement is expected to be faster. 217 bool AArch64SIMDInstrOpt:: 218 shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 219 SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) { 220 // Check if replacement decision is already available in the cached table. 221 // if so, return it. 222 std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); 223 auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); 224 if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end()) 225 return SIMDInstrTable[InstID]; 226 227 unsigned SCIdx = InstDesc->getSchedClass(); 228 const MCSchedClassDesc *SCDesc = 229 SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); 230 231 // If a target does not define resources for the instructions 232 // of interest, then return false for no replacement. 233 const MCSchedClassDesc *SCDescRepl; 234 if (!SCDesc->isValid() || SCDesc->isVariant()) 235 { 236 SIMDInstrTable[InstID] = false; 237 return false; 238 } 239 for (auto IDesc : InstDescRepl) 240 { 241 SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc( 242 IDesc->getSchedClass()); 243 if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) 244 { 245 SIMDInstrTable[InstID] = false; 246 return false; 247 } 248 } 249 250 // Replacement cost. 251 unsigned ReplCost = 0; 252 for (auto IDesc :InstDescRepl) 253 ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode()); 254 255 if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) 256 { 257 SIMDInstrTable[InstID] = true; 258 return true; 259 } 260 else 261 { 262 SIMDInstrTable[InstID] = false; 263 return false; 264 } 265 } 266 267 /// Determine if we need to exit this pass for a kind of instruction replacement 268 /// early. This makes sure that no compile time is spent in this pass for 269 /// targets with no need for any of these optimizations beyond performing this 270 /// check. 271 /// Return true if early exit of this pass for a kind of instruction 272 /// replacement is recommended for a target. 273 bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { 274 const MCInstrDesc* OriginalMCID; 275 SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 276 277 switch (SP) { 278 // For this optimization, check by comparing the latency of a representative 279 // instruction to that of the replacement instructions. 280 // TODO: check for all concerned instructions. 281 case VectorElem: 282 OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed); 283 ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane)); 284 ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32)); 285 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) 286 return false; 287 break; 288 289 // For this optimization, check for all concerned instructions. 290 case Interleave: 291 std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); 292 if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end()) 293 return InterlEarlyExit[Subtarget]; 294 295 for (auto &I : IRT) { 296 OriginalMCID = &TII->get(I.OrigOpc); 297 for (auto &Repl : I.ReplOpc) 298 ReplInstrMCID.push_back(&TII->get(Repl)); 299 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) { 300 InterlEarlyExit[Subtarget] = false; 301 return false; 302 } 303 ReplInstrMCID.clear(); 304 } 305 InterlEarlyExit[Subtarget] = true; 306 break; 307 } 308 309 return true; 310 } 311 312 /// Check whether an equivalent DUP instruction has already been 313 /// created or not. 314 /// Return true when the DUP instruction already exists. In this case, 315 /// DestReg will point to the destination of the already created DUP. 316 bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, 317 unsigned SrcReg, unsigned LaneNumber, 318 unsigned *DestReg) const { 319 for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); 320 MII != MIE;) { 321 MII--; 322 MachineInstr *CurrentMI = &*MII; 323 324 if (CurrentMI->getOpcode() == DupOpcode && 325 CurrentMI->getNumOperands() == 3 && 326 CurrentMI->getOperand(1).getReg() == SrcReg && 327 CurrentMI->getOperand(2).getImm() == LaneNumber) { 328 *DestReg = CurrentMI->getOperand(0).getReg(); 329 return true; 330 } 331 } 332 333 return false; 334 } 335 336 /// Certain SIMD instructions with vector element operand are not efficient. 337 /// Rewrite them into SIMD instructions with vector operands. This rewrite 338 /// is driven by the latency of the instructions. 339 /// The instruction of concerns are for the time being FMLA, FMLS, FMUL, 340 /// and FMULX and hence they are hardcoded. 341 /// 342 /// For example: 343 /// fmla v0.4s, v1.4s, v2.s[1] 344 /// 345 /// Is rewritten into 346 /// dup v3.4s, v2.s[1] // DUP not necessary if redundant 347 /// fmla v0.4s, v1.4s, v3.4s 348 /// 349 /// Return true if the SIMD instruction is modified. 350 bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { 351 const MCInstrDesc *MulMCID, *DupMCID; 352 const TargetRegisterClass *RC = &AArch64::FPR128RegClass; 353 354 switch (MI.getOpcode()) { 355 default: 356 return false; 357 358 // 4X32 instructions 359 case AArch64::FMLAv4i32_indexed: 360 DupMCID = &TII->get(AArch64::DUPv4i32lane); 361 MulMCID = &TII->get(AArch64::FMLAv4f32); 362 break; 363 case AArch64::FMLSv4i32_indexed: 364 DupMCID = &TII->get(AArch64::DUPv4i32lane); 365 MulMCID = &TII->get(AArch64::FMLSv4f32); 366 break; 367 case AArch64::FMULXv4i32_indexed: 368 DupMCID = &TII->get(AArch64::DUPv4i32lane); 369 MulMCID = &TII->get(AArch64::FMULXv4f32); 370 break; 371 case AArch64::FMULv4i32_indexed: 372 DupMCID = &TII->get(AArch64::DUPv4i32lane); 373 MulMCID = &TII->get(AArch64::FMULv4f32); 374 break; 375 376 // 2X64 instructions 377 case AArch64::FMLAv2i64_indexed: 378 DupMCID = &TII->get(AArch64::DUPv2i64lane); 379 MulMCID = &TII->get(AArch64::FMLAv2f64); 380 break; 381 case AArch64::FMLSv2i64_indexed: 382 DupMCID = &TII->get(AArch64::DUPv2i64lane); 383 MulMCID = &TII->get(AArch64::FMLSv2f64); 384 break; 385 case AArch64::FMULXv2i64_indexed: 386 DupMCID = &TII->get(AArch64::DUPv2i64lane); 387 MulMCID = &TII->get(AArch64::FMULXv2f64); 388 break; 389 case AArch64::FMULv2i64_indexed: 390 DupMCID = &TII->get(AArch64::DUPv2i64lane); 391 MulMCID = &TII->get(AArch64::FMULv2f64); 392 break; 393 394 // 2X32 instructions 395 case AArch64::FMLAv2i32_indexed: 396 RC = &AArch64::FPR64RegClass; 397 DupMCID = &TII->get(AArch64::DUPv2i32lane); 398 MulMCID = &TII->get(AArch64::FMLAv2f32); 399 break; 400 case AArch64::FMLSv2i32_indexed: 401 RC = &AArch64::FPR64RegClass; 402 DupMCID = &TII->get(AArch64::DUPv2i32lane); 403 MulMCID = &TII->get(AArch64::FMLSv2f32); 404 break; 405 case AArch64::FMULXv2i32_indexed: 406 RC = &AArch64::FPR64RegClass; 407 DupMCID = &TII->get(AArch64::DUPv2i32lane); 408 MulMCID = &TII->get(AArch64::FMULXv2f32); 409 break; 410 case AArch64::FMULv2i32_indexed: 411 RC = &AArch64::FPR64RegClass; 412 DupMCID = &TII->get(AArch64::DUPv2i32lane); 413 MulMCID = &TII->get(AArch64::FMULv2f32); 414 break; 415 } 416 417 SmallVector<const MCInstrDesc*, 2> ReplInstrMCID; 418 ReplInstrMCID.push_back(DupMCID); 419 ReplInstrMCID.push_back(MulMCID); 420 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 421 ReplInstrMCID)) 422 return false; 423 424 const DebugLoc &DL = MI.getDebugLoc(); 425 MachineBasicBlock &MBB = *MI.getParent(); 426 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 427 428 // Get the operands of the current SIMD arithmetic instruction. 429 unsigned MulDest = MI.getOperand(0).getReg(); 430 unsigned SrcReg0 = MI.getOperand(1).getReg(); 431 unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); 432 unsigned SrcReg1 = MI.getOperand(2).getReg(); 433 unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); 434 unsigned DupDest; 435 436 // Instructions of interest have either 4 or 5 operands. 437 if (MI.getNumOperands() == 5) { 438 unsigned SrcReg2 = MI.getOperand(3).getReg(); 439 unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); 440 unsigned LaneNumber = MI.getOperand(4).getImm(); 441 // Create a new DUP instruction. Note that if an equivalent DUP instruction 442 // has already been created before, then use that one instead of creating 443 // a new one. 444 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { 445 DupDest = MRI.createVirtualRegister(RC); 446 BuildMI(MBB, MI, DL, *DupMCID, DupDest) 447 .addReg(SrcReg2, Src2IsKill) 448 .addImm(LaneNumber); 449 } 450 BuildMI(MBB, MI, DL, *MulMCID, MulDest) 451 .addReg(SrcReg0, Src0IsKill) 452 .addReg(SrcReg1, Src1IsKill) 453 .addReg(DupDest, Src2IsKill); 454 } else if (MI.getNumOperands() == 4) { 455 unsigned LaneNumber = MI.getOperand(3).getImm(); 456 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { 457 DupDest = MRI.createVirtualRegister(RC); 458 BuildMI(MBB, MI, DL, *DupMCID, DupDest) 459 .addReg(SrcReg1, Src1IsKill) 460 .addImm(LaneNumber); 461 } 462 BuildMI(MBB, MI, DL, *MulMCID, MulDest) 463 .addReg(SrcReg0, Src0IsKill) 464 .addReg(DupDest, Src1IsKill); 465 } else { 466 return false; 467 } 468 469 ++NumModifiedInstr; 470 return true; 471 } 472 473 /// Load/Store Interleaving instructions are not always beneficial. 474 /// Replace them by ZIP instructions and classical load/store. 475 /// 476 /// For example: 477 /// st2 {v0.4s, v1.4s}, addr 478 /// 479 /// Is rewritten into: 480 /// zip1 v2.4s, v0.4s, v1.4s 481 /// zip2 v3.4s, v0.4s, v1.4s 482 /// stp q2, q3, addr 483 // 484 /// For example: 485 /// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr 486 /// 487 /// Is rewritten into: 488 /// zip1 v4.4s, v0.4s, v2.4s 489 /// zip2 v5.4s, v0.4s, v2.4s 490 /// zip1 v6.4s, v1.4s, v3.4s 491 /// zip2 v7.4s, v1.4s, v3.4s 492 /// zip1 v8.4s, v4.4s, v6.4s 493 /// zip2 v9.4s, v4.4s, v6.4s 494 /// zip1 v10.4s, v5.4s, v7.4s 495 /// zip2 v11.4s, v5.4s, v7.4s 496 /// stp q8, q9, addr 497 /// stp q10, q11, addr+32 498 /// 499 /// Currently only instructions related to ST2 and ST4 are considered. 500 /// Other may be added later. 501 /// Return true if the SIMD instruction is modified. 502 bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { 503 504 unsigned SeqReg, AddrReg; 505 unsigned StReg[4], StRegKill[4]; 506 MachineInstr *DefiningMI; 507 const DebugLoc &DL = MI.getDebugLoc(); 508 MachineBasicBlock &MBB = *MI.getParent(); 509 SmallVector<unsigned, MaxNumRepl> ZipDest; 510 SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 511 512 // If current instruction matches any of the rewriting rules, then 513 // gather information about parameters of the new instructions. 514 bool Match = false; 515 for (auto &I : IRT) { 516 if (MI.getOpcode() == I.OrigOpc) { 517 SeqReg = MI.getOperand(0).getReg(); 518 AddrReg = MI.getOperand(1).getReg(); 519 DefiningMI = MRI->getUniqueVRegDef(SeqReg); 520 unsigned NumReg = determineSrcReg(MI); 521 if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg)) 522 return false; 523 524 for (auto &Repl : I.ReplOpc) { 525 ReplInstrMCID.push_back(&TII->get(Repl)); 526 // Generate destination registers but only for non-store instruction. 527 if (Repl != AArch64::STPQi && Repl != AArch64::STPDi) 528 ZipDest.push_back(MRI->createVirtualRegister(&I.RC)); 529 } 530 Match = true; 531 break; 532 } 533 } 534 535 if (!Match) 536 return false; 537 538 // Determine if it is profitable to replace MI by the series of instructions 539 // represented in ReplInstrMCID. 540 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 541 ReplInstrMCID)) 542 return false; 543 544 // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at 545 // this point, the code generation is hardcoded and does not rely on the IRT 546 // table used above given that code generation for ST2 replacement is somewhat 547 // different than for ST4 replacement. We could have added more info into the 548 // table related to how we build new instructions but we may be adding more 549 // complexity with that). 550 switch (MI.getOpcode()) { 551 default: 552 return false; 553 554 case AArch64::ST2Twov16b: 555 case AArch64::ST2Twov8b: 556 case AArch64::ST2Twov8h: 557 case AArch64::ST2Twov4h: 558 case AArch64::ST2Twov4s: 559 case AArch64::ST2Twov2s: 560 case AArch64::ST2Twov2d: 561 // ZIP instructions 562 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 563 .addReg(StReg[0]) 564 .addReg(StReg[1]); 565 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 566 .addReg(StReg[0], StRegKill[0]) 567 .addReg(StReg[1], StRegKill[1]); 568 // STP instructions 569 BuildMI(MBB, MI, DL, *ReplInstrMCID[2]) 570 .addReg(ZipDest[0]) 571 .addReg(ZipDest[1]) 572 .addReg(AddrReg) 573 .addImm(0); 574 break; 575 576 case AArch64::ST4Fourv16b: 577 case AArch64::ST4Fourv8b: 578 case AArch64::ST4Fourv8h: 579 case AArch64::ST4Fourv4h: 580 case AArch64::ST4Fourv4s: 581 case AArch64::ST4Fourv2s: 582 case AArch64::ST4Fourv2d: 583 // ZIP instructions 584 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 585 .addReg(StReg[0]) 586 .addReg(StReg[2]); 587 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 588 .addReg(StReg[0], StRegKill[0]) 589 .addReg(StReg[2], StRegKill[2]); 590 BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2]) 591 .addReg(StReg[1]) 592 .addReg(StReg[3]); 593 BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3]) 594 .addReg(StReg[1], StRegKill[1]) 595 .addReg(StReg[3], StRegKill[3]); 596 BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4]) 597 .addReg(ZipDest[0]) 598 .addReg(ZipDest[2]); 599 BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5]) 600 .addReg(ZipDest[0]) 601 .addReg(ZipDest[2]); 602 BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6]) 603 .addReg(ZipDest[1]) 604 .addReg(ZipDest[3]); 605 BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7]) 606 .addReg(ZipDest[1]) 607 .addReg(ZipDest[3]); 608 // stp instructions 609 BuildMI(MBB, MI, DL, *ReplInstrMCID[8]) 610 .addReg(ZipDest[4]) 611 .addReg(ZipDest[5]) 612 .addReg(AddrReg) 613 .addImm(0); 614 BuildMI(MBB, MI, DL, *ReplInstrMCID[9]) 615 .addReg(ZipDest[6]) 616 .addReg(ZipDest[7]) 617 .addReg(AddrReg) 618 .addImm(2); 619 break; 620 } 621 622 ++NumModifiedInstr; 623 return true; 624 } 625 626 /// Process The REG_SEQUENCE instruction, and extract the source 627 /// operands of the ST2/4 instruction from it. 628 /// Example of such instruction. 629 /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 630 /// Return true when the instruction is processed successfully. 631 bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, 632 unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { 633 assert (DefiningMI != NULL); 634 if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) 635 return false; 636 637 for (unsigned i=0; i<NumArg; i++) { 638 StReg[i] = DefiningMI->getOperand(2*i+1).getReg(); 639 StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill()); 640 641 // Sanity check for the other arguments. 642 if (DefiningMI->getOperand(2*i+2).isImm()) { 643 switch (DefiningMI->getOperand(2*i+2).getImm()) { 644 default: 645 return false; 646 647 case AArch64::dsub0: 648 case AArch64::dsub1: 649 case AArch64::dsub2: 650 case AArch64::dsub3: 651 case AArch64::qsub0: 652 case AArch64::qsub1: 653 case AArch64::qsub2: 654 case AArch64::qsub3: 655 break; 656 } 657 } 658 else 659 return false; 660 } 661 return true; 662 } 663 664 /// Return the number of useful source registers for this instruction 665 /// (2 for ST2 and 4 for ST4). 666 unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const { 667 switch (MI.getOpcode()) { 668 default: 669 llvm_unreachable("Unsupported instruction for this pass"); 670 671 case AArch64::ST2Twov16b: 672 case AArch64::ST2Twov8b: 673 case AArch64::ST2Twov8h: 674 case AArch64::ST2Twov4h: 675 case AArch64::ST2Twov4s: 676 case AArch64::ST2Twov2s: 677 case AArch64::ST2Twov2d: 678 return 2; 679 680 case AArch64::ST4Fourv16b: 681 case AArch64::ST4Fourv8b: 682 case AArch64::ST4Fourv8h: 683 case AArch64::ST4Fourv4h: 684 case AArch64::ST4Fourv4s: 685 case AArch64::ST4Fourv2s: 686 case AArch64::ST4Fourv2d: 687 return 4; 688 } 689 } 690 691 bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { 692 if (skipFunction(MF.getFunction())) 693 return false; 694 695 TII = MF.getSubtarget().getInstrInfo(); 696 MRI = &MF.getRegInfo(); 697 const TargetSubtargetInfo &ST = MF.getSubtarget(); 698 const AArch64InstrInfo *AAII = 699 static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); 700 if (!AAII) 701 return false; 702 SchedModel.init(&ST); 703 if (!SchedModel.hasInstrSchedModel()) 704 return false; 705 706 bool Changed = false; 707 for (auto OptimizationKind : {VectorElem, Interleave}) { 708 if (!shouldExitEarly(&MF, OptimizationKind)) { 709 SmallVector<MachineInstr *, 8> RemoveMIs; 710 for (MachineBasicBlock &MBB : MF) { 711 for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); 712 MII != MIE;) { 713 MachineInstr &MI = *MII; 714 bool InstRewrite; 715 if (OptimizationKind == VectorElem) 716 InstRewrite = optimizeVectElement(MI) ; 717 else 718 InstRewrite = optimizeLdStInterleave(MI); 719 if (InstRewrite) { 720 // Add MI to the list of instructions to be removed given that it 721 // has been replaced. 722 RemoveMIs.push_back(&MI); 723 Changed = true; 724 } 725 ++MII; 726 } 727 } 728 for (MachineInstr *MI : RemoveMIs) 729 MI->eraseFromParent(); 730 } 731 } 732 733 return Changed; 734 } 735 736 /// Returns an instance of the high cost ASIMD instruction replacement 737 /// optimization pass. 738 FunctionPass *llvm::createAArch64SIMDInstrOptPass() { 739 return new AArch64SIMDInstrOpt(); 740 } 741