//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Subtarget.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  llvm::append_range(QDst, QSrc);
  QSrc.clear();
}

static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check whether we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approx. number of WF that allows TEX inst to hide ALU inst is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)),
    // which is where the 62.5 constant below comes from (500 / 8 = 62.5).
    float ALUFetchRationEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRationEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (needs one GPR) or TmXYZW = TnXYZW (needs two GPRs).
      // (TODO: use RegisterPressure)
      // If we are going to use too many GPRs, we flush fetch instructions to
      // lower register pressure on 128-bit regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // Try to pick ALU.
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // Try to pick FETCH.
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Try to pick other.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE \n";
    for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
      const SUnit &S = DAG->SUnits[i];
      if (!S.isScheduled)
        DAG->dumpNode(S);
    }
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !MI->getOperand(1).getReg().isVirtual();
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause; we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);

}

bool R600SchedStrategy::regBelongsToClass(Register Reg,
                                          const TargetRegisterClass *RC) const {
  if (!Reg.isVirtual()) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  Register DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  Register DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}

SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW ALU available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}