//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
//    inefficiency on some targets.
//
//    For example:
//      fmla v0.4s, v1.4s, v2.s[1]
//
//    Is rewritten into:
//      dup  v3.4s, v2.s[1]
//      fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
//    inefficiency on some targets.
//
//    For example:
//      st2 {v0.4s, v1.4s}, addr
//
//    Is rewritten into:
//      zip1 v2.4s, v0.4s, v1.4s
//      zip2 v3.4s, v0.4s, v1.4s
//      stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <unordered_map>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // The two maps below are used to cache decisions instead of recomputing:
  // This map caches instruction replacement decisions within a function and
  // across functions compiled for the same subtarget.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // This map caches, for a particular target, the decision of whether to exit
  // the interleaved-store replacement subpass early.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum {
    VectorElem,
    Interleave
  } Subpass;

  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
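  // The RC field is the register class used in optimizeLdStInterleave() to
  // create the virtual destination registers of the non-store replacement
  // instructions (i.e. the ZIP results that the final STP stores).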
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };

#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}

  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };

  // A costly instruction is replaced in this work by N efficient instructions.
  // The maximum of N is currently 10, reached in the ST4 case.
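  // (For ST4, the RuleST4 entries above expand into 8 ZIP1/ZIP2 instructions
  // plus 2 STP instructions.)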
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions stored in the
  /// array InstDescRepl.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);

  /// Determine if we need to exit the instruction replacement optimization
  /// subpasses early. This makes sure that no compile time is spent in this
  /// pass for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction, and extract the source
  /// operands of the ST2/ST4 instruction from it.
  /// Example of such an instruction:
  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
                         unsigned* StRegKill, unsigned NumArg) const;

  /// Load/Store interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructions and classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Check if the replacement decision is already available in the cached
  // table. If so, return it.
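  // The cache key pairs the opcode with the CPU name, so a decision computed
  // while compiling one function is reused for later functions compiled for
  // the same subtarget.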
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If a target does not define resources for the instructions
  // of interest, then return false for no replacement.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}

/// Determine if we need to exit this pass for a kind of instruction
/// replacement early. This makes sure that no compile time is spent in this
/// pass for targets with no need for any of these optimizations beyond
/// performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc* OriginalMCID;
  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For this optimization, check by comparing the latency of a representative
  // instruction to that of the replacement instructions.
  // TODO: check for all concerned instructions.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For this optimization, check for all concerned instructions.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}

/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
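/// The search walks backwards from MI to the beginning of its basic block, so
/// only a DUP that precedes MI in the same block can be reused.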
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}

/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX, and hence they are hardcoded.
///
/// For example:
///   fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into:
///   dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
///   fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(),
                         &TII->get(MI.getOpcode()), ReplInstrMCID))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 (FMUL, FMULX) or 5 (FMLA, FMLS)
  // operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}

/// Load/Store interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
///   st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
///   zip1 v2.4s, v0.4s, v1.4s
///   zip2 v3.4s, v0.4s, v1.4s
///   stp  q2, q3, addr
///
/// For example:
///   st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
///   zip1 v4.4s, v0.4s, v2.4s
///   zip2 v5.4s, v0.4s, v2.4s
///   zip1 v6.4s, v1.4s, v3.4s
///   zip2 v7.4s, v1.4s, v3.4s
///   zip1 v8.4s, v4.4s, v6.4s
///   zip2 v9.4s, v4.4s, v6.4s
///   zip1 v10.4s, v5.4s, v7.4s
///   zip2 v11.4s, v5.4s, v7.4s
///   stp  q8, q9, addr
///   stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches any of the rewriting rules, gather
  // information about the parameters of the new instructions.
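  // Schematically (MIR shown approximately), a matched ST2 looks like:
  //   %tuple = REG_SEQUENCE %v0, %subreg.qsub0, %v1, %subreg.qsub1
  //   ST2Twov4s %tuple, %addr
  // processSeqRegInst() extracts %v0/%v1 and their kill flags from the
  // REG_SEQUENCE into StReg/StRegKill.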
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers but only for non-store instructions.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;

  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement instructions composed of ZIP1, ZIP2, and STP.
  // At this point the code generation is hardcoded and does not rely on the
  // IRT table used above, since the code generated for the ST2 replacement is
  // somewhat different from that for the ST4 replacement. We could add more
  // information to the table about how to build the new instructions, but that
  // would add complexity for little gain.
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1])
        .addReg(StReg[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions. The immediate offset is scaled by the store size, so
    // an immediate of 2 places the second pair 32 bytes (Q form) or 16 bytes
    // (D form) past the first.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}

/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/ST4 instruction from it.
/// Example of such an instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i] = DefiningMI->getOperand(2 * i + 1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());

    // Validation check for the other arguments.
    if (DefiningMI->getOperand(2 * i + 2).isImm()) {
      switch (DefiningMI->getOperand(2 * i + 2).getImm()) {
      default:
        return false;

      case AArch64::dsub0:
      case AArch64::dsub1:
      case AArch64::dsub2:
      case AArch64::dsub3:
      case AArch64::qsub0:
      case AArch64::qsub1:
      case AArch64::qsub2:
      case AArch64::qsub3:
        break;
      }
    } else {
      return false;
    }
  }
  return true;
}

/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}