//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Subtarget.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupiedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst) {
  llvm::append_range(QDst, QSrc);
  QSrc.clear();
}

static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

SUnit *R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check whether we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
                          (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
                            (!Available[IDFetch].empty() ||
                             !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // the approx. number of WF that allows ALU insts to hide TEX latency is
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)).
    float ALUFetchRatioEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRatioEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRatioEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirements of the TEX clause (which consumes 128-bit regs); ALU
      // insts before and after TEX are indeed likely to consume or generate
      // values from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (need one GPR) or TmXYZW = TEX TnXYZW (need 2 GPRs).
      // (TODO: use RegisterPressure.)
      // If we are going to use too many GPRs, we flush the Fetch instructions
      // to lower register pressure on 128-bit regs.
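      // Illustrative example (hypothetical numbers, not from the AMD guide):
      // with 100 ALU instructions (emitted plus pending) and 10 fetch
      // instructions, the ratio estimate is 10, so about 62.5 / 10 ~= 6
      // wavefronts are needed to hide the TEX latency. Those 10 fetches need
      // roughly 2 * 10 = 20 GPRs, which still allows 248 / 20 = 12
      // wavefronts, so this heuristic does not force a switch away from the
      // ALU clause.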
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // Try to pick an ALU instruction.
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // Try to pick a FETCH instruction.
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Try to pick any other instruction.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE \n";
    for (const SUnit &S : DAG->SUnits)
      if (!S.isScheduled)
        DAG->dumpNode(S);
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupiedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      // Each ALU literal operand takes an extra slot in the clause.
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
           E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

static bool isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !MI->getOperand(1).getReg().isVirtual();
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause; we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

bool R600SchedStrategy::regBelongsToClass(Register Reg,
                                          const TargetRegisterClass *RC) const {
  if (!Reg.isVirtual()) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

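// Classify an ALU SUnit by the slot(s) its instruction can occupy (a summary
// of the checks below): a fixed vector lane (AluT_X/Y/Z/W), the whole
// instruction group (AluT_XYZW), the Trans slot only (AluTrans), a predicate
// setter (AluPredX), a copy that will be discarded (AluDiscarded), or any
// lane (AluAny).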
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole instruction group?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  Register DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit *SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (SUnit *SU : QSrc) {
    AluKind AK = getAluKind(SU);
    AvailableAlus[AK].push_back(SU);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert(OccupiedSlotsMask && "Slot wasn't filled");
  OccupiedSlotsMask = 0;
  // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
  //   OccupiedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr *MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  Register DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is both def'd and used in the same
  // instruction and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
       E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() && MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
         AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
         AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
         AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
         AvailableAlus[AluPredX].size();
}

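// OccupiedSlotsMask layout, as used by pickAlu() below: bits 0-3 track the
// X/Y/Z/W vector slots of the current instruction group and bit 4 (value 16)
// tracks the Trans slot, so 31 marks the whole group as occupied and 15 only
// the vector slots. When the mask is empty, a PRED_X or a discarded copy
// takes the whole group, a T_XYZW instruction takes the four vector slots,
// and otherwise the Trans slot (on VLIW5 targets) and then the W down to X
// channels are filled one by one until PrepareNextSlot() resets the mask.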
SUnit *R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupiedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupiedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush copies of undef values (they will become KILLs and be
      // discarded by RA).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupiedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupiedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccupied = OccupiedSlotsMask & 16;
    if (!TransSlotOccupied && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupiedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupiedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupiedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupiedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit *R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}