//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
/// that may inhibit the HW prefetching.  This is done in two steps.  Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata.  Then, after opcodes have been finalized, we
/// insert MOVs and re-write loads to prevent unintentional tag collisions.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "aarch64-falkor-hwpf-fix"

STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
          "Number of HW prefetch tag collisions avoided");
STATISTIC(NumCollisionsNotAvoided,
          "Number of HW prefetch tag collisions not avoided due to lack of registers");
DEBUG_COUNTER(FixCounter, "falkor-hwpf",
              "Controls which tag collisions are avoided");

namespace {

/// IR-level helper that walks every loop known to LoopInfo and tags the
/// strided loads of innermost loops with FALKOR_STRIDED_ACCESS_MD metadata.
class FalkorMarkStridedAccesses {
public:
  FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
      : LI(LI), SE(SE) {}

  /// Visit all loops; returns true if any load was marked.
  bool run();

private:
  /// Mark the strided loads of \p L; returns true if any load was marked.
  bool runOnLoop(Loop &L);

  LoopInfo &LI;
  ScalarEvolution &SE;
};

/// Legacy pass-manager wrapper around FalkorMarkStridedAccesses.
class FalkorMarkStridedAccessesLegacy : public FunctionPass {
public:
  static char ID; // Pass ID, replacement for typeid

  FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
    initializeFalkorMarkStridedAccessesLegacyPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
  }

  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char FalkorMarkStridedAccessesLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
                      "Falkor HW Prefetch Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
                    "Falkor HW Prefetch Fix", false, false)

/// Factory for the IR-level strided-access marking pass.
FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
  return new FalkorMarkStridedAccessesLegacy();
}

/// Runs the marking step on \p F.  Does nothing (and reports no change) when
/// the subtarget's processor family is not Falkor or the function is skipped.
bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const AArch64Subtarget *ST =
      TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
  if (ST->getProcFamily() != AArch64Subtarget::Falkor)
    return false;

  if (skipFunction(F))
    return false;

  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

  FalkorMarkStridedAccesses LDP(LI, SE);
  return LDP.run();
}

bool FalkorMarkStridedAccesses::run() {
  bool MadeChange = false;

  // Visit each top-level loop and, depth-first, every loop nested inside it.
  for (Loop *L : LI)
    for (Loop *LIt : depth_first(L))
      MadeChange |= runOnLoop(*LIt);

  return MadeChange;
}

bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
  // Only mark strided loads in the inner-most loop
  if (!L.isInnermost())
    return false;

  bool MadeChange = false;

  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      LoadInst *LoadI = dyn_cast<LoadInst>(&I);
      if (!LoadI)
        continue;

      // A loop-invariant address is not a strided access.
      Value *PtrValue = LoadI->getPointerOperand();
      if (L.isLoopInvariant(PtrValue))
        continue;

      // Only addresses that SCEV models as an affine add-recurrence qualify.
      const SCEV *LSCEV = SE.getSCEV(PtrValue);
      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
        continue;

      // The metadata node carries no operands; its presence is the marker.
      LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
                         MDNode::get(LoadI->getContext(), {}));
      ++NumStridedLoadsMarked;
      LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
      MadeChange = true;
    }
  }

  return MadeChange;
}

namespace {

/// Late machine pass that rewrites strided loads in innermost loops so that
/// their HW prefetcher tag does not collide with that of another load in the
/// same loop.
class FalkorHWPFFix : public MachineFunctionPass {
public:
  static char ID;

  FalkorHWPFFix() : MachineFunctionPass(ID) {
    initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

private:
  /// Detect and fix tag collisions among the loads of loop \p L.
  void runOnLoop(MachineLoop &L, MachineFunction &Fn);

  // Set at the start of runOnMachineFunction; initialised here so the members
  // never hold indeterminate values before the pass first runs.
  const AArch64InstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  /// Maps a prefetcher tag to the loads of the current loop that produce it.
  DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
  bool Modified = false;
};

/// Bits from load opcodes used to compute HW prefetcher instruction tags.
struct LoadInfo {
  LoadInfo() = default;

  Register DestReg;  ///< Null when no destination register feeds the tag.
  Register BaseReg;
  int BaseRegIdx = -1; ///< Operand index of the base register.
  const MachineOperand *OffsetOpnd = nullptr; ///< Null when there is none.
  bool IsPrePost = false; ///< True for pre/post-increment forms.
};

} // end anonymous namespace

char FalkorHWPFFix::ID = 0;

INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
                      "Falkor HW Prefetch Fix Late Phase", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
                    "Falkor HW Prefetch Fix Late Phase", false, false)

/// Packs a tag from the low 4 bits of \p Dest (bits 0-3), the low 4 bits of
/// \p Base (bits 4-7) and the low 6 bits of \p Offset (bits 8-13).
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}

/// Extracts the operands of \p MI that feed the prefetcher tag.
///
/// \returns std::nullopt when the opcode is not one of the loads listed below
/// or when the base register is SP/WSP; otherwise the populated LoadInfo.
/// In the tables below an index of -1 means "no such operand": the
/// corresponding LoadInfo field is left null.
static std::optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  int DestRegIdx;
  int BaseRegIdx;
  int OffsetIdx;
  bool IsPrePost;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;

  case AArch64::LD1i64:
  case AArch64::LD2i64:
    DestRegIdx = 0;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1i8:
  case AArch64::LD1i16:
  case AArch64::LD1i32:
  case AArch64::LD2i8:
  case AArch64::LD2i16:
  case AArch64::LD2i32:
  case AArch64::LD3i8:
  case AArch64::LD3i16:
  case AArch64::LD3i32:
  case AArch64::LD3i64:
  case AArch64::LD4i8:
  case AArch64::LD4i16:
  case AArch64::LD4i32:
  case AArch64::LD4i64:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Onev1d:
  case AArch64::LD1Onev2s:
  case AArch64::LD1Onev4h:
  case AArch64::LD1Onev8b:
  case AArch64::LD1Onev2d:
  case AArch64::LD1Onev4s:
  case AArch64::LD1Onev8h:
  case AArch64::LD1Onev16b:
  case AArch64::LD1Rv1d:
  case AArch64::LD1Rv2s:
  case AArch64::LD1Rv4h:
  case AArch64::LD1Rv8b:
  case AArch64::LD1Rv2d:
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Twov1d:
  case AArch64::LD1Twov2s:
  case AArch64::LD1Twov4h:
  case AArch64::LD1Twov8b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Twov4s:
  case AArch64::LD1Twov8h:
  case AArch64::LD1Twov16b:
  case AArch64::LD1Threev1d:
  case AArch64::LD1Threev2s:
  case AArch64::LD1Threev4h:
  case AArch64::LD1Threev8b:
  case AArch64::LD1Threev2d:
  case AArch64::LD1Threev4s:
  case AArch64::LD1Threev8h:
  case AArch64::LD1Threev16b:
  case AArch64::LD1Fourv1d:
  case AArch64::LD1Fourv2s:
  case AArch64::LD1Fourv4h:
  case AArch64::LD1Fourv8b:
  case AArch64::LD1Fourv2d:
  case AArch64::LD1Fourv4s:
  case AArch64::LD1Fourv8h:
  case AArch64::LD1Fourv16b:
  case AArch64::LD2Twov2s:
  case AArch64::LD2Twov4s:
  case AArch64::LD2Twov8b:
  case AArch64::LD2Twov2d:
  case AArch64::LD2Twov4h:
  case AArch64::LD2Twov8h:
  case AArch64::LD2Twov16b:
  case AArch64::LD2Rv1d:
  case AArch64::LD2Rv2s:
  case AArch64::LD2Rv4s:
  case AArch64::LD2Rv8b:
  case AArch64::LD2Rv2d:
  case AArch64::LD2Rv4h:
  case AArch64::LD2Rv8h:
  case AArch64::LD2Rv16b:
  case AArch64::LD3Threev2s:
  case AArch64::LD3Threev4h:
  case AArch64::LD3Threev8b:
  case AArch64::LD3Threev2d:
  case AArch64::LD3Threev4s:
  case AArch64::LD3Threev8h:
  case AArch64::LD3Threev16b:
  case AArch64::LD3Rv1d:
  case AArch64::LD3Rv2s:
  case AArch64::LD3Rv4h:
  case AArch64::LD3Rv8b:
  case AArch64::LD3Rv2d:
  case AArch64::LD3Rv4s:
  case AArch64::LD3Rv8h:
  case AArch64::LD3Rv16b:
  case AArch64::LD4Fourv2s:
  case AArch64::LD4Fourv4h:
  case AArch64::LD4Fourv8b:
  case AArch64::LD4Fourv2d:
  case AArch64::LD4Fourv4s:
  case AArch64::LD4Fourv8h:
  case AArch64::LD4Fourv16b:
  case AArch64::LD4Rv1d:
  case AArch64::LD4Rv2s:
  case AArch64::LD4Rv4h:
  case AArch64::LD4Rv8b:
  case AArch64::LD4Rv2d:
  case AArch64::LD4Rv4s:
  case AArch64::LD4Rv8h:
  case AArch64::LD4Rv16b:
    DestRegIdx = -1;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1i64_POST:
  case AArch64::LD2i64_POST:
    DestRegIdx = 1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1i8_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD4i8_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
  case AArch64::LD4i64_POST:
    DestRegIdx = -1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1Onev1d_POST:
  case AArch64::LD1Onev2s_POST:
  case AArch64::LD1Onev4h_POST:
  case AArch64::LD1Onev8b_POST:
  case AArch64::LD1Onev2d_POST:
  case AArch64::LD1Onev4s_POST:
  case AArch64::LD1Onev8h_POST:
  case AArch64::LD1Onev16b_POST:
  case AArch64::LD1Rv1d_POST:
  case AArch64::LD1Rv2s_POST:
  case AArch64::LD1Rv4h_POST:
  case AArch64::LD1Rv8b_POST:
  case AArch64::LD1Rv2d_POST:
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Rv16b_POST:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8h_POST:
  case AArch64::LD1Twov16b_POST:
  case AArch64::LD1Threev1d_POST:
  case AArch64::LD1Threev2s_POST:
  case AArch64::LD1Threev4h_POST:
  case AArch64::LD1Threev8b_POST:
  case AArch64::LD1Threev2d_POST:
  case AArch64::LD1Threev4s_POST:
  case AArch64::LD1Threev8h_POST:
  case AArch64::LD1Threev16b_POST:
  case AArch64::LD1Fourv1d_POST:
  case AArch64::LD1Fourv2s_POST:
  case AArch64::LD1Fourv4h_POST:
  case AArch64::LD1Fourv8b_POST:
  case AArch64::LD1Fourv2d_POST:
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv8h_POST:
  case AArch64::LD2Rv16b_POST:
  case AArch64::LD3Threev2s_POST:
  case AArch64::LD3Threev4h_POST:
  case AArch64::LD3Threev8b_POST:
  case AArch64::LD3Threev2d_POST:
  case AArch64::LD3Threev4s_POST:
  case AArch64::LD3Threev8h_POST:
  case AArch64::LD3Threev16b_POST:
  case AArch64::LD3Rv1d_POST:
  case AArch64::LD3Rv2s_POST:
  case AArch64::LD3Rv4h_POST:
  case AArch64::LD3Rv8b_POST:
  case AArch64::LD3Rv2d_POST:
  case AArch64::LD3Rv4s_POST:
  case AArch64::LD3Rv8h_POST:
  case AArch64::LD3Rv16b_POST:
  case AArch64::LD4Fourv2s_POST:
  case AArch64::LD4Fourv4h_POST:
  case AArch64::LD4Fourv8b_POST:
  case AArch64::LD4Fourv2d_POST:
  case AArch64::LD4Fourv4s_POST:
  case AArch64::LD4Fourv8h_POST:
  case AArch64::LD4Fourv16b_POST:
  case AArch64::LD4Rv1d_POST:
  case AArch64::LD4Rv2s_POST:
  case AArch64::LD4Rv4h_POST:
  case AArch64::LD4Rv8b_POST:
  case AArch64::LD4Rv2d_POST:
  case AArch64::LD4Rv4s_POST:
  case AArch64::LD4Rv8h_POST:
  case AArch64::LD4Rv16b_POST:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  // NOTE(review): the literal-load opcodes (LDRDl, LDRQl, LDRSWl, LDRSl,
  // LDRWl, LDRXl) are grouped here with BaseRegIdx = 1, and getReg() is called
  // on that operand below -- confirm operand 1 of those forms is a register.
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBBui:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRBui:
  case AArch64::LDRDl:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRDui:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHHui:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRHui:
  case AArch64::LDRQl:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRQui:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBXui:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHXui:
  case AArch64::LDRSWl:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSWui:
  case AArch64::LDRSl:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRSui:
  case AArch64::LDRWl:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRWui:
  case AArch64::LDRXl:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::LDRXui:
  case AArch64::LDURBBi:
  case AArch64::LDURBi:
  case AArch64::LDURDi:
  case AArch64::LDURHHi:
  case AArch64::LDURHi:
  case AArch64::LDURQi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSHWi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSWi:
  case AArch64::LDURSi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = 2;
    IsPrePost = false;
    break;

  case AArch64::LDRBBpost:
  case AArch64::LDRBBpre:
  case AArch64::LDRBpost:
  case AArch64::LDRBpre:
  case AArch64::LDRDpost:
  case AArch64::LDRDpre:
  case AArch64::LDRHHpost:
  case AArch64::LDRHHpre:
  case AArch64::LDRHpost:
  case AArch64::LDRHpre:
  case AArch64::LDRQpost:
  case AArch64::LDRQpre:
  case AArch64::LDRSBWpost:
  case AArch64::LDRSBWpre:
  case AArch64::LDRSBXpost:
  case AArch64::LDRSBXpre:
  case AArch64::LDRSHWpost:
  case AArch64::LDRSHWpre:
  case AArch64::LDRSHXpost:
  case AArch64::LDRSHXpre:
  case AArch64::LDRSWpost:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpost:
  case AArch64::LDRSpre:
  case AArch64::LDRWpost:
  case AArch64::LDRWpre:
  case AArch64::LDRXpost:
  case AArch64::LDRXpre:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  case AArch64::LDNPDi:
  case AArch64::LDNPQi:
  case AArch64::LDNPSi:
  case AArch64::LDPQi:
  case AArch64::LDPDi:
  case AArch64::LDPSi:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
    DestRegIdx = 0;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;

  case AArch64::LDPSWpost:
  case AArch64::LDPSWpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::LDPXpost:
  case AArch64::LDPXpre:
    DestRegIdx = 1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;
  }

  // Loads from the stack pointer don't get prefetched.
  Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
  if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
    return std::nullopt;

  LoadInfo LI;
  LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
  LI.BaseReg = BaseReg;
  LI.BaseRegIdx = BaseRegIdx;
  LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
  LI.IsPrePost = IsPrePost;
  return LI;
}

/// Computes the prefetcher tag for the load described by \p LI.
///
/// The destination contributes its encoding value, or 0 when LI.DestReg is
/// null.  The offset contributes 0 when there is no offset operand,
/// (1 << 5) | encoding for a register offset, and imm >> 2 for an immediate.
///
/// \returns std::nullopt when the offset operand is a global, symbol or
/// constant-pool index.  The result depends on the offset operand's kind but
/// not on LI.BaseReg, so changing only the base register cannot turn a valid
/// tag into std::nullopt.
static std::optional<unsigned> getTag(const TargetRegisterInfo *TRI,
                                      const MachineInstr &MI,
                                      const LoadInfo &LI) {
  unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
  unsigned Base = TRI->getEncodingValue(LI.BaseReg);
  unsigned Off;
  if (LI.OffsetOpnd == nullptr)
    Off = 0;
  else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
           LI.OffsetOpnd->isCPI())
    return std::nullopt;
  else if (LI.OffsetOpnd->isReg())
    Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
  else
    Off = LI.OffsetOpnd->getImm() >> 2;

  return makeTag(Dest, Base, Off);
}

/// Fixes tag collisions in loop \p L.
///
/// First builds TagMap for every recognised load in the loop.  If some tag is
/// shared by more than one load and at least one of them is a strided access,
/// each block is walked bottom-up (tracking live register units) and every
/// colliding strided load gets its base register copied into a free,
/// non-reserved GPR64 scratch register whose resulting tag is not yet in
/// TagMap.  Sets Modified when an instruction is rewritten.
void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
  // Build the initial tag map for the whole loop.
  TagMap.clear();
  for (MachineBasicBlock *MBB : L.getBlocks())
    for (MachineInstr &MI : *MBB) {
      std::optional<LoadInfo> LInfo = getLoadInfo(MI);
      if (!LInfo)
        continue;
      std::optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
      if (!Tag)
        continue;
      TagMap[*Tag].push_back(&MI);
    }

  // A collision only matters if one of the loads sharing the tag is strided.
  bool AnyCollisions = false;
  for (auto &P : TagMap) {
    auto Size = P.second.size();
    if (Size > 1) {
      for (auto *MI : P.second) {
        if (TII->isStridedAccess(*MI)) {
          AnyCollisions = true;
          break;
        }
      }
    }
    if (AnyCollisions)
      break;
  }
  // Nothing to fix.
  if (!AnyCollisions)
    return;

  MachineRegisterInfo &MRI = Fn.getRegInfo();

  // Go through all the basic blocks in the current loop and fix any streaming
  // loads to avoid collisions with any other loads.
  LiveRegUnits LR(*TRI);
  for (MachineBasicBlock *MBB : L.getBlocks()) {
    LR.clear();
    LR.addLiveOuts(*MBB);
    // Walk bottom-up; liveness is stepped over *I before advancing, so inside
    // the body LR describes the registers live just after MI.
    for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
      MachineInstr &MI = *I;
      if (!TII->isStridedAccess(MI))
        continue;

      std::optional<LoadInfo> OptLdI = getLoadInfo(MI);
      if (!OptLdI)
        continue;
      LoadInfo LdI = *OptLdI;
      std::optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
      if (!OptOldTag)
        continue;
      auto &OldCollisions = TagMap[*OptOldTag];
      if (OldCollisions.size() <= 1)
        continue;

      bool Fixed = false;
      LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);

      if (!DebugCounter::shouldExecute(FixCounter)) {
        LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n  " << MI);
        continue;
      }

      // Add the non-base registers of MI as live so we don't use them as
      // scratch registers.
      for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
        if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
          continue;
        MachineOperand &MO = MI.getOperand(OpI);
        if (MO.isReg() && MO.readsReg())
          LR.addReg(MO.getReg());
      }

      for (unsigned ScratchReg : AArch64::GPR64RegClass) {
        if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
          continue;

        LoadInfo NewLdI(LdI);
        NewLdI.BaseReg = ScratchReg;
        // Only the base register differs from LdI, whose tag was computed
        // successfully above, so the offset operand is already known to be
        // acceptable to getTag.
        std::optional<unsigned> OptNewTag = getTag(TRI, MI, NewLdI);
        assert(OptNewTag && "tag must exist: only the base register changed");
        unsigned NewTag = *OptNewTag;
        // Scratch reg tag would collide too, so don't use it.
        if (TagMap.count(NewTag))
          continue;

        LLVM_DEBUG(dbgs() << "Changing base reg to: "
                          << printReg(ScratchReg, TRI) << '\n');

        // Rewrite:
        //   Xd = LOAD Xb, off
        // to:
        //   Xc = MOV Xb
        //   Xd = LOAD Xc, off
        DebugLoc DL = MI.getDebugLoc();
        BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
            .addReg(AArch64::XZR)
            .addReg(LdI.BaseReg)
            .addImm(0);
        MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
        BaseOpnd.setReg(ScratchReg);

        // If the load does a pre/post increment, then insert a MOV after as
        // well to update the real base register.
        if (LdI.IsPrePost) {
          LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
                            << printReg(ScratchReg, TRI) << '\n');
          MI.getOperand(0).setReg(
              ScratchReg); // Change tied operand pre/post update dest.
          BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
                  TII->get(AArch64::ORRXrs), LdI.BaseReg)
              .addReg(AArch64::XZR)
              .addReg(ScratchReg)
              .addImm(0);
        }

        // Drop MI from its old collision bucket (swap with the last element
        // and pop; bucket order is not significant).
        for (unsigned Idx = 0, End = OldCollisions.size(); Idx != End; ++Idx)
          if (OldCollisions[Idx] == &MI) {
            std::swap(OldCollisions[Idx], OldCollisions[End - 1]);
            OldCollisions.pop_back();
            break;
          }

        // Update TagMap to reflect instruction changes to reduce the number
        // of later MOVs to be inserted.  This needs to be done after
        // OldCollisions is updated since it may be relocated by this
        // insertion.
        TagMap[NewTag].push_back(&MI);
        ++NumCollisionsAvoided;
        Fixed = true;
        Modified = true;
        break;
      }
      if (!Fixed)
        ++NumCollisionsNotAvoided;
    }
  }
}

/// Runs the late fix on \p Fn.  Does nothing (and reports no change) when the
/// subtarget's processor family is not Falkor or the function is skipped.
/// \returns true if any instruction was rewritten.
bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
  auto &ST = Fn.getSubtarget<AArch64Subtarget>();
  if (ST.getProcFamily() != AArch64Subtarget::Falkor)
    return false;

  if (skipFunction(Fn.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  TRI = ST.getRegisterInfo();

  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();

  Modified = false;

  for (MachineLoop *I : LI)
    for (MachineLoop *L : depth_first(I))
      // Only process inner-loops
      if (L->isInnermost())
        runOnLoop(*L, Fn);

  return Modified;
}

/// Factory for the late machine-level HW prefetch fix pass.
FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }