//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Subtarget.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  // All five slots (X, Y, Z, W and Trans) start out occupied, so the first
  // pickAlu() call goes through PrepareNextSlot() before filling a group.
  OccupiedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst) {
  llvm::append_range(QDst, QSrc);
  QSrc.clear();
}

static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approx. number of WF that allows TEX inst to hide ALU inst is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    float ALUFetchRationEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRationEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (needs one GPR) or TmXYZW = TnXYZW (needs two GPRs).
      // (TODO: use RegisterPressure)
      // If we are going to use too many GPRs, we flush the Fetch instructions
      // to lower register pressure on 128-bit regs.
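      // Worked example (illustrative numbers only, not measured data): with
      // 40 ALU and 10 fetch instructions the ratio estimate is 4, so
      // NeededWF = 62.5 / 4 = 15 (truncated). The ten pending fetches need
      // about 2 * 10 = 20 GPRs, which limits occupancy to 248 / 20 = 12
      // wavefronts; since 15 > 12, we would switch away from ALU here.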
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // try to pick ALU
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE \n";
    for (const SUnit &S : DAG->SUnits)
      if (!S.isScheduled)
        DAG->dumpNode(S);
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupiedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      // Count each literal operand as an extra emitted instruction.
      for (MachineOperand &MO : SU->getInstr()->operands())
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !MI->getOperand(1).getReg().isVirtual();
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause, so we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

bool R600SchedStrategy::regBelongsToClass(Register Reg,
                                          const TargetRegisterClass *RC) const {
  if (!Reg.isVirtual())
    return RC->contains(Reg);
  return MRI->getRegClass(Reg) == RC;
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL, don't consider it in scheduling.
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  Register DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    }
    InstructionsGroupCandidate.pop_back();
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (SUnit *SU : QSrc) {
    AluKind AK = getAluKind(SU);
    AvailableAlus[AK].push_back(SU);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert(OccupiedSlotsMask && "Slot wasn't filled");
  OccupiedSlotsMask = 0;
  // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
  //   OccupiedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  Register DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def'd and used in the same inst
  // and we try to constrain its regclass.
  for (MachineOperand &MO : MI->operands())
    if (MO.isReg() && !MO.isDef() && MO.getReg() == DestReg)
      return;
  // Constrain the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
         AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
         AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
         AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
         AvailableAlus[AluPredX].size();
}

SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupiedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupiedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush discarded copies (undef COPYs that become KILLs and are
      // discarded later).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupiedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW ALU available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupiedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccupied = OccupiedSlotsMask & 16;
    if (!TransSlotOccupied && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupiedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupiedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupiedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupiedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}