//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
/// that may inhibit the HW prefetching. This is done in two steps. Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata. Then, after opcodes have been finalized, we
/// insert MOVs and re-write loads to prevent unintentional tag collisions.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "falkor-hwpf-fix"

STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
          "Number of HW prefetch tag collisions avoided");
STATISTIC(NumCollisionsNotAvoided,
          "Number of HW prefetch tag collisions not avoided due to lack of "
          "registers");
DEBUG_COUNTER(FixCounter, "falkor-hwpf",
              "Controls which tag collisions are avoided");

namespace {

class FalkorMarkStridedAccesses {
public:
  FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
      : LI(LI), SE(SE) {}

  bool run();

private:
  bool runOnLoop(Loop &L);

  LoopInfo &LI;
  ScalarEvolution &SE;
};

class FalkorMarkStridedAccessesLegacy : public FunctionPass {
public:
  static char ID; // Pass ID, replacement for typeid

  FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
    initializeFalkorMarkStridedAccessesLegacyPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
  }

  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char FalkorMarkStridedAccessesLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
                      "Falkor HW Prefetch Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
                    "Falkor HW Prefetch Fix", false, false)

FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
  return new FalkorMarkStridedAccessesLegacy();
}

bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const AArch64Subtarget *ST =
      TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
  if (ST->getProcFamily() != AArch64Subtarget::Falkor)
    return false;

  if (skipFunction(F))
    return false;

  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

  FalkorMarkStridedAccesses LDP(LI, SE);
  return LDP.run();
}

bool FalkorMarkStridedAccesses::run() {
  bool MadeChange = false;

  for (Loop *L : LI)
    for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt)
      MadeChange |= runOnLoop(**LIt);

  return MadeChange;
}

bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
  // Only mark strided loads in the inner-most loop
  if (!L.empty())
    return false;

  bool MadeChange = false;

  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      LoadInst *LoadI = dyn_cast<LoadInst>(&I);
      if (!LoadI)
        continue;

      Value *PtrValue = LoadI->getPointerOperand();
      if (L.isLoopInvariant(PtrValue))
        continue;

      const SCEV *LSCEV = SE.getSCEV(PtrValue);
      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
        continue;

      LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
                         MDNode::get(LoadI->getContext(), {}));
      ++NumStridedLoadsMarked;
      LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
      MadeChange = true;
    }
  }

  return MadeChange;
}

namespace {

class FalkorHWPFFix : public MachineFunctionPass {
public:
  static char ID;

  FalkorHWPFFix() : MachineFunctionPass(ID) {
    initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

private:
  void runOnLoop(MachineLoop &L, MachineFunction &Fn);

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
  bool Modified;
};

/// Bits from load opcodes used to compute HW prefetcher instruction tags.
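/// The tag (see makeTag below) packs the destination register encoding into
/// bits [3:0], the base register encoding into bits [7:4], and six bits
/// derived from the offset operand into bits [13:8].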
struct LoadInfo {
  LoadInfo() = default;

  Register DestReg;
  Register BaseReg;
  int BaseRegIdx = -1;
  const MachineOperand *OffsetOpnd = nullptr;
  bool IsPrePost = false;
};

} // end anonymous namespace

char FalkorHWPFFix::ID = 0;

INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
                      "Falkor HW Prefetch Fix Late Phase", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
                    "Falkor HW Prefetch Fix Late Phase", false, false)

static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}

static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  int DestRegIdx;
  int BaseRegIdx;
  int OffsetIdx;
  bool IsPrePost;

  switch (MI.getOpcode()) {
  default:
    return None;

  case AArch64::LD1i64:
  case AArch64::LD2i64:
    DestRegIdx = 0;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1i8:
  case AArch64::LD1i16:
  case AArch64::LD1i32:
  case AArch64::LD2i8:
  case AArch64::LD2i16:
  case AArch64::LD2i32:
  case AArch64::LD3i8:
  case AArch64::LD3i16:
  case AArch64::LD3i32:
  case AArch64::LD3i64:
  case AArch64::LD4i8:
  case AArch64::LD4i16:
  case AArch64::LD4i32:
  case AArch64::LD4i64:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Onev1d:
  case AArch64::LD1Onev2s:
  case AArch64::LD1Onev4h:
  case AArch64::LD1Onev8b:
  case AArch64::LD1Onev2d:
  case AArch64::LD1Onev4s:
  case AArch64::LD1Onev8h:
  case AArch64::LD1Onev16b:
  case AArch64::LD1Rv1d:
  case AArch64::LD1Rv2s:
  case AArch64::LD1Rv4h:
  case AArch64::LD1Rv8b:
  case AArch64::LD1Rv2d:
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Twov1d:
  case AArch64::LD1Twov2s:
  case AArch64::LD1Twov4h:
  case AArch64::LD1Twov8b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Twov4s:
  case AArch64::LD1Twov8h:
  case AArch64::LD1Twov16b:
  case AArch64::LD1Threev1d:
  case AArch64::LD1Threev2s:
  case AArch64::LD1Threev4h:
  case AArch64::LD1Threev8b:
  case AArch64::LD1Threev2d:
  case AArch64::LD1Threev4s:
  case AArch64::LD1Threev8h:
  case AArch64::LD1Threev16b:
  case AArch64::LD1Fourv1d:
  case AArch64::LD1Fourv2s:
  case AArch64::LD1Fourv4h:
  case AArch64::LD1Fourv8b:
  case AArch64::LD1Fourv2d:
  case AArch64::LD1Fourv4s:
  case AArch64::LD1Fourv8h:
  case AArch64::LD1Fourv16b:
  case AArch64::LD2Twov2s:
  case AArch64::LD2Twov4s:
  case AArch64::LD2Twov8b:
  case AArch64::LD2Twov2d:
  case AArch64::LD2Twov4h:
  case AArch64::LD2Twov8h:
  case AArch64::LD2Twov16b:
  case AArch64::LD2Rv1d:
  case AArch64::LD2Rv2s:
  case AArch64::LD2Rv4s:
  case AArch64::LD2Rv8b:
  case AArch64::LD2Rv2d:
  case AArch64::LD2Rv4h:
  case AArch64::LD2Rv8h:
  case AArch64::LD2Rv16b:
  case AArch64::LD3Threev2s:
  case AArch64::LD3Threev4h:
  case AArch64::LD3Threev8b:
  case AArch64::LD3Threev2d:
  case AArch64::LD3Threev4s:
  case AArch64::LD3Threev8h:
  case AArch64::LD3Threev16b:
  case AArch64::LD3Rv1d:
  case AArch64::LD3Rv2s:
  case AArch64::LD3Rv4h:
  case AArch64::LD3Rv8b:
  case AArch64::LD3Rv2d:
  case AArch64::LD3Rv4s:
  case AArch64::LD3Rv8h:
  case AArch64::LD3Rv16b:
  case AArch64::LD4Fourv2s:
  case AArch64::LD4Fourv4h:
  case AArch64::LD4Fourv8b:
  case AArch64::LD4Fourv2d:
  case AArch64::LD4Fourv4s:
  case AArch64::LD4Fourv8h:
  case AArch64::LD4Fourv16b:
  case AArch64::LD4Rv1d:
  case AArch64::LD4Rv2s:
  case AArch64::LD4Rv4h:
  case AArch64::LD4Rv8b:
  case AArch64::LD4Rv2d:
  case AArch64::LD4Rv4s:
  case AArch64::LD4Rv8h:
  case AArch64::LD4Rv16b:
    DestRegIdx = -1;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1i64_POST:
  case AArch64::LD2i64_POST:
    DestRegIdx = 1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1i8_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD4i8_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
  case AArch64::LD4i64_POST:
    DestRegIdx = -1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1Onev1d_POST:
  case AArch64::LD1Onev2s_POST:
  case AArch64::LD1Onev4h_POST:
  case AArch64::LD1Onev8b_POST:
  case AArch64::LD1Onev2d_POST:
  case AArch64::LD1Onev4s_POST:
  case AArch64::LD1Onev8h_POST:
  case AArch64::LD1Onev16b_POST:
  case AArch64::LD1Rv1d_POST:
  case AArch64::LD1Rv2s_POST:
  case AArch64::LD1Rv4h_POST:
  case AArch64::LD1Rv8b_POST:
  case AArch64::LD1Rv2d_POST:
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Rv16b_POST:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8h_POST:
  case AArch64::LD1Twov16b_POST:
  case AArch64::LD1Threev1d_POST:
  case AArch64::LD1Threev2s_POST:
  case AArch64::LD1Threev4h_POST:
  case AArch64::LD1Threev8b_POST:
  case AArch64::LD1Threev2d_POST:
  case AArch64::LD1Threev4s_POST:
  case AArch64::LD1Threev8h_POST:
  case AArch64::LD1Threev16b_POST:
  case AArch64::LD1Fourv1d_POST:
  case AArch64::LD1Fourv2s_POST:
  case AArch64::LD1Fourv4h_POST:
  case AArch64::LD1Fourv8b_POST:
  case AArch64::LD1Fourv2d_POST:
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv8h_POST:
  case AArch64::LD2Rv16b_POST:
  case AArch64::LD3Threev2s_POST:
  case AArch64::LD3Threev4h_POST:
  case AArch64::LD3Threev8b_POST:
  case AArch64::LD3Threev2d_POST:
  case AArch64::LD3Threev4s_POST:
  case AArch64::LD3Threev8h_POST:
  case AArch64::LD3Threev16b_POST:
  case AArch64::LD3Rv1d_POST:
  case AArch64::LD3Rv2s_POST:
  case AArch64::LD3Rv4h_POST:
  case AArch64::LD3Rv8b_POST:
  case AArch64::LD3Rv2d_POST:
  case AArch64::LD3Rv4s_POST:
  case AArch64::LD3Rv8h_POST:
  case AArch64::LD3Rv16b_POST:
  case AArch64::LD4Fourv2s_POST:
  case AArch64::LD4Fourv4h_POST:
  case AArch64::LD4Fourv8b_POST:
  case AArch64::LD4Fourv2d_POST:
  case AArch64::LD4Fourv4s_POST:
  case AArch64::LD4Fourv8h_POST:
  case AArch64::LD4Fourv16b_POST:
  case AArch64::LD4Rv1d_POST:
  case AArch64::LD4Rv2s_POST:
  case AArch64::LD4Rv4h_POST:
  case AArch64::LD4Rv8b_POST:
  case AArch64::LD4Rv2d_POST:
  case AArch64::LD4Rv4s_POST:
  case AArch64::LD4Rv8h_POST:
  case AArch64::LD4Rv16b_POST:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBBui:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRBui:
  case AArch64::LDRDl:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRDui:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHHui:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRHui:
  case AArch64::LDRQl:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRQui:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBXui:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHXui:
  case AArch64::LDRSWl:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSWui:
  case AArch64::LDRSl:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRSui:
  case AArch64::LDRWl:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRWui:
  case AArch64::LDRXl:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::LDRXui:
  case AArch64::LDURBBi:
  case AArch64::LDURBi:
  case AArch64::LDURDi:
  case AArch64::LDURHHi:
  case AArch64::LDURHi:
  case AArch64::LDURQi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSHWi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSWi:
  case AArch64::LDURSi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = 2;
    IsPrePost = false;
    break;

  case AArch64::LDRBBpost:
  case AArch64::LDRBBpre:
  case AArch64::LDRBpost:
  case AArch64::LDRBpre:
  case AArch64::LDRDpost:
  case AArch64::LDRDpre:
  case AArch64::LDRHHpost:
  case AArch64::LDRHHpre:
  case AArch64::LDRHpost:
  case AArch64::LDRHpre:
  case AArch64::LDRQpost:
  case AArch64::LDRQpre:
  case AArch64::LDRSBWpost:
  case AArch64::LDRSBWpre:
  case AArch64::LDRSBXpost:
  case AArch64::LDRSBXpre:
  case AArch64::LDRSHWpost:
  case AArch64::LDRSHWpre:
  case AArch64::LDRSHXpost:
  case AArch64::LDRSHXpre:
  case AArch64::LDRSWpost:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpost:
  case AArch64::LDRSpre:
  case AArch64::LDRWpost:
  case AArch64::LDRWpre:
  case AArch64::LDRXpost:
  case AArch64::LDRXpre:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

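  // Pair and non-temporal pair loads. The FP/SIMD pair forms below don't feed
  // a destination register into the tag; the GPR pair forms use their first
  // destination register.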
  case AArch64::LDNPDi:
  case AArch64::LDNPQi:
  case AArch64::LDNPSi:
  case AArch64::LDPQi:
  case AArch64::LDPDi:
  case AArch64::LDPSi:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
    DestRegIdx = 0;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;

  case AArch64::LDPSWpost:
  case AArch64::LDPSWpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::LDPXpost:
  case AArch64::LDPXpre:
    DestRegIdx = 1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;
  }

  // Loads from the stack pointer don't get prefetched.
  Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
  if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
    return None;

  LoadInfo LI;
  LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
  LI.BaseReg = BaseReg;
  LI.BaseRegIdx = BaseRegIdx;
  LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
  LI.IsPrePost = IsPrePost;
  return LI;
}

static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
                                 const MachineInstr &MI, const LoadInfo &LI) {
  unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
  unsigned Base = TRI->getEncodingValue(LI.BaseReg);
  unsigned Off;
  if (LI.OffsetOpnd == nullptr)
    Off = 0;
  else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
           LI.OffsetOpnd->isCPI())
    return None;
  else if (LI.OffsetOpnd->isReg())
    Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
  else
    Off = LI.OffsetOpnd->getImm() >> 2;

  return makeTag(Dest, Base, Off);
}

void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
  // Build the initial tag map for the whole loop.
  TagMap.clear();
  for (MachineBasicBlock *MBB : L.getBlocks())
    for (MachineInstr &MI : *MBB) {
      Optional<LoadInfo> LInfo = getLoadInfo(MI);
      if (!LInfo)
        continue;
      Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
      if (!Tag)
        continue;
      TagMap[*Tag].push_back(&MI);
    }

  bool AnyCollisions = false;
  for (auto &P : TagMap) {
    auto Size = P.second.size();
    if (Size > 1) {
      for (auto *MI : P.second) {
        if (TII->isStridedAccess(*MI)) {
          AnyCollisions = true;
          break;
        }
      }
    }
    if (AnyCollisions)
      break;
  }
  // Nothing to fix.
  if (!AnyCollisions)
    return;

  MachineRegisterInfo &MRI = Fn.getRegInfo();

  // Go through all the basic blocks in the current loop and fix any streaming
  // loads to avoid collisions with any other loads.
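  // Liveness is tracked backward from each block's live-outs so that, when a
  // colliding load is visited, LR holds the registers live after it and a free
  // (non-reserved) GPR64 can be chosen as the scratch base register.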
  LiveRegUnits LR(*TRI);
  for (MachineBasicBlock *MBB : L.getBlocks()) {
    LR.clear();
    LR.addLiveOuts(*MBB);
    for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
      MachineInstr &MI = *I;
      if (!TII->isStridedAccess(MI))
        continue;

      Optional<LoadInfo> OptLdI = getLoadInfo(MI);
      if (!OptLdI)
        continue;
      LoadInfo LdI = *OptLdI;
      Optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
      if (!OptOldTag)
        continue;
      auto &OldCollisions = TagMap[*OptOldTag];
      if (OldCollisions.size() <= 1)
        continue;

      bool Fixed = false;
      LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);

      if (!DebugCounter::shouldExecute(FixCounter)) {
        LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
        continue;
      }

      // Add the non-base registers of MI as live so we don't use them as
      // scratch registers.
      for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
        if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
          continue;
        MachineOperand &MO = MI.getOperand(OpI);
        if (MO.isReg() && MO.readsReg())
          LR.addReg(MO.getReg());
      }

      for (unsigned ScratchReg : AArch64::GPR64RegClass) {
        if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
          continue;

        LoadInfo NewLdI(LdI);
        NewLdI.BaseReg = ScratchReg;
        unsigned NewTag = *getTag(TRI, MI, NewLdI);
        // Scratch reg tag would collide too, so don't use it.
        if (TagMap.count(NewTag))
          continue;

        LLVM_DEBUG(dbgs() << "Changing base reg to: "
                          << printReg(ScratchReg, TRI) << '\n');

        // Rewrite:
        //   Xd = LOAD Xb, off
        // to:
        //   Xc = MOV Xb
        //   Xd = LOAD Xc, off
        DebugLoc DL = MI.getDebugLoc();
        BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
            .addReg(AArch64::XZR)
            .addReg(LdI.BaseReg)
            .addImm(0);
        MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
        BaseOpnd.setReg(ScratchReg);

        // If the load does a pre/post increment, then insert a MOV after as
        // well to update the real base register.
        if (LdI.IsPrePost) {
          LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
                            << printReg(ScratchReg, TRI) << '\n');
          MI.getOperand(0).setReg(
              ScratchReg); // Change tied operand pre/post update dest.
          BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
                  TII->get(AArch64::ORRXrs), LdI.BaseReg)
              .addReg(AArch64::XZR)
              .addReg(ScratchReg)
              .addImm(0);
        }

        for (int I = 0, E = OldCollisions.size(); I != E; ++I)
          if (OldCollisions[I] == &MI) {
            std::swap(OldCollisions[I], OldCollisions[E - 1]);
            OldCollisions.pop_back();
            break;
          }

        // Update TagMap to reflect instruction changes to reduce the number
        // of later MOVs to be inserted. This needs to be done after
        // OldCollisions is updated since it may be relocated by this
        // insertion.
        TagMap[NewTag].push_back(&MI);
        ++NumCollisionsAvoided;
        Fixed = true;
        Modified = true;
        break;
      }
      if (!Fixed)
        ++NumCollisionsNotAvoided;
    }
  }
}

bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
  auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  if (ST.getProcFamily() != AArch64Subtarget::Falkor)
    return false;

  if (skipFunction(Fn.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  TRI = ST.getRegisterInfo();

  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();

  Modified = false;

  for (MachineLoop *I : LI)
    for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
      // Only process inner-loops
      if (L->empty())
        runOnLoop(**L, Fn);

  return Modified;
}

FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }