//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// MachineModel definitions for Southern Islands (SI)
//
//===----------------------------------------------------------------------===//

// Prolog code for the scheduling predicates in this file: it obtains the
// SIInstrInfo from the scheduling model and names it TII. The (void) cast
// keeps the variable referenced even when a predicate does not use it.
def : PredicateProlog<[{
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo());
  (void)TII;
}]>;

//===----------------------------------------------------------------------===//
// Scheduling write/read types
//===----------------------------------------------------------------------===//

def WriteBranch  : SchedWrite;
def WriteExport  : SchedWrite;
def WriteLDS     : SchedWrite;
def WriteSALU    : SchedWrite;
def WriteSMEM    : SchedWrite;
def WriteVMEM    : SchedWrite;
def WriteBarrier : SchedWrite;

def MIVGPRRead : SchedRead;
def MIMFMARead : SchedRead;

// Normal 16-bit or 32-bit VALU instructions.
def Write32Bit         : SchedWrite;
// Conversions to or from F32 (excluding F64 <-> F32 conversions).
def WriteFloatCvt      : SchedWrite;
// F16 or F32 transcendental instructions (these are quarter rate).
def WriteTrans32       : SchedWrite;
// All other quarter rate VALU instructions.
def WriteQuarterRate32 : SchedWrite;

def WriteFloatFMA : SchedWrite;

// Slow, quarter rate f64 instructions.
def WriteDouble : SchedWrite;

// Half rate f64 instructions (same as v_add_f64).
def WriteDoubleAdd : SchedWrite;

// Conversions to or from f64.
def WriteDoubleCvt : SchedWrite;

// F64 "transcendental" instructions (actually only reciprocal and/or
// square root).
def WriteTrans64 : SchedWrite;

// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;

// Integer multiplications.
def WriteIntMul : SchedWrite;

// MAI multipass instructions.
def Write2PassMAI   : SchedWrite;
def Write4PassMAI   : SchedWrite;
def Write8PassMAI   : SchedWrite;
def Write16PassMAI  : SchedWrite;
def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;

// Scalar float instructions
def WriteSFPU : SchedWrite;

// F16 or F32 pseudo scalar transcendental instructions
def WritePseudoScalarTrans : SchedWrite;

// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)

// Common base for every machine model defined in this file. All settings
// are overrides of fields inherited from SchedMachineModel.
class SISchedMachineModel : SchedMachineModel {
  let CompleteModel = 1;
  // MicroOpBufferSize = 1 means that instructions will always be added to
  // the ready queue when they become available.  This exposes them
  // to the register pressure analysis.
  let MicroOpBufferSize = 1;
  let IssueWidth = 1;
  let PostRAScheduler = 1;

  // FIXME: Approximate 2 * branch cost.  Try to hack around bad
  // early-ifcvt heuristics. These need improvement to avoid the OOE
  // heuristics.
  //
  // Written as a `let` override, like the settings above, rather than
  // re-declaring the inherited field with a type.
  let MispredictPenalty = 20;
}

// One model record per speed/generation variant. The per-model write
// resources are attached further down via `let SchedModel = ... in`.
def SIFullSpeedModel         : SISchedMachineModel;
def SIQuarterSpeedModel      : SISchedMachineModel;
def SIDPFullSpeedModel       : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel          : SISchedMachineModel;
def GFX11SpeedModel          : SISchedMachineModel;
def GFX12SpeedModel          : SISchedMachineModel;

// XXX: Are the resource counts correct?
// Processor resources referenced by the write resources below. Each is
// declared with a single unit.
let BufferSize = 1 in {
def HWBranch    : ProcResource<1>;
def HWExport    : ProcResource<1>;
def HWLGKM      : ProcResource<1>;
def HWSALU      : ProcResource<1>;
def HWVMEM      : ProcResource<1>;
def HWVALU      : ProcResource<1>;
def HWTransVALU : ProcResource<1>; // Transcendental VALU
def HWRC        : ProcResource<1>; // Register destination cache
} // End BufferSize = 1

// MFMA CU. This is the only resource declared with BufferSize = 0.
let BufferSize = 0 in
def HWXDL       : ProcResource<1>;

// WriteRes for `write` that occupies `resources` and has the given latency.
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
                 int latency> : WriteRes<write, resources> {
  let Latency = latency;
}

// Shorthand for an HWWriteRes whose only resource is HWVALU.
class HWVALUWriteRes<SchedWrite write, int latency> :
  HWWriteRes<write, [HWVALU], latency>;

// Marks `write` as unsupported in the enclosing model; no resources are
// attached.
class UnsupportedWriteRes<SchedWrite write> : WriteRes<write, []> {
  let Unsupported = 1;
}

// True when SIInstrInfo reports that the instruction has VGPR uses.
def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;

// Resolves to MIVGPRRead when PredMIReadVGPR holds, ReadDefault otherwise.
def MIReadVGPR : SchedReadVariant<[
      SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
      SchedVar<NoSchedPred, [ReadDefault]>]>;

// The latency numbers are taken from AMD Accelerated Parallel Processing
// guide. They may not be accurate.

// The latency values are 1 / (operations / cycle) / 4.
//
// Write resources and read advances shared by the SI* models; each of those
// models instantiates this multiclass with `defm : SICommonWriteRes;`.
multiclass SICommonWriteRes {

  let RetireOOO = 1 in { // llvm-mca specific flag
  def : HWWriteRes<WriteBranch,  [HWBranch], 8>;
  def : HWWriteRes<WriteExport,  [HWExport], 4>;
  def : HWWriteRes<WriteLDS,     [HWLGKM],   5>; // Can be between 2 and 64
  def : HWWriteRes<WriteSALU,    [HWSALU],   1>;
  def : HWWriteRes<WriteSMEM,    [HWLGKM],   5>;
  def : HWWriteRes<WriteVMEM,    [HWVMEM],   80>;
  def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???

  def : HWVALUWriteRes<Write32Bit,         1>;
  def : HWVALUWriteRes<WriteFloatCvt,      4>;
  def : HWVALUWriteRes<WriteTrans32,       4>;
  def : HWVALUWriteRes<WriteQuarterRate32, 4>;

  // DGEMM writes are modelled on HWVALU; ReleaseAtCycles matches the latency.
  let ReleaseAtCycles = [4] in
  def : HWVALUWriteRes<Write4PassDGEMM,    4>;
  let ReleaseAtCycles = [8] in
  def : HWVALUWriteRes<Write8PassDGEMM,    8>;

  // MAI writes are modelled on HWXDL; ReleaseAtCycles matches the latency.
  let ReleaseAtCycles = [2] in
  def : HWWriteRes<Write2PassMAI,  [HWXDL], 2>;
  let ReleaseAtCycles = [4] in
  def : HWWriteRes<Write4PassMAI,  [HWXDL], 4>;
  let ReleaseAtCycles = [8] in
  def : HWWriteRes<Write8PassMAI,  [HWXDL], 8>;
  let ReleaseAtCycles = [16] in
  def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;

  def : UnsupportedWriteRes<WriteSFPU>;
  def : UnsupportedWriteRes<WritePseudoScalarTrans>;
  } // End RetireOOO = 1

  def : ReadAdvance<MIVGPRRead, -2>;

  // Technically mfma reads can be from 0 to 4 cycles but that does not make
  // sense to model because its register setup is huge. In particular if we
  // properly model read advance as -2 for a vgpr read it will result in a
  // bad scheduling of acc writes before that mfma. To avoid it we would
  // need to consume 2 or 4 more vgprs to be initialized before the acc
  // write sequence. Just assume worst case here.
  def : ReadAdvance<MIMFMARead, -4>;
}

// COPY scheduling: VGPR copies are split on the size of operand 0, and any
// other copy is treated as a SALU write.
// NOTE(review): both predicates compare getOpSize() against 32 - confirm
// that the unit getOpSize() returns agrees with the 32/64 naming used here.
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>;
def WriteCopy : SchedWriteVariant<[
    SchedVar<PredIsVGPR32Copy, [Write32Bit]>,
    SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
    SchedVar<NoSchedPred, [WriteSALU]>]>;

//===----------------------------------------------------------------------===//
// SIFullSpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = SIFullSpeedModel in {

defm : SICommonWriteRes;

let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<Write64Bit,       2>;
def : HWVALUWriteRes<WriteIntMul,      4>;
def : HWVALUWriteRes<WriteFloatFMA,    1>;
def : HWVALUWriteRes<WriteDouble,      4>;
def : HWVALUWriteRes<WriteDoubleAdd,   2>;
def : HWVALUWriteRes<WriteDoubleCvt,   4>;
def : HWVALUWriteRes<WriteTrans64,     4>;
} // End RetireOOO = 1

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = SIFullSpeedModel

//===----------------------------------------------------------------------===//
// SIQuarterSpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = SIQuarterSpeedModel in {

defm : SICommonWriteRes;

let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<Write64Bit,       2>;
def : HWVALUWriteRes<WriteIntMul,      4>;
def : HWVALUWriteRes<WriteFloatFMA,    16>;
def : HWVALUWriteRes<WriteDouble,      16>;
def : HWVALUWriteRes<WriteDoubleAdd,    8>;
def : HWVALUWriteRes<WriteDoubleCvt,    4>;
def : HWVALUWriteRes<WriteTrans64,     16>;
} // End RetireOOO = 1

def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;

} // End SchedModel = SIQuarterSpeedModel

//===----------------------------------------------------------------------===//
// SIDPFullSpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = SIDPFullSpeedModel in {

defm : SICommonWriteRes;

let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<WriteFloatFMA,    1>;
def : HWVALUWriteRes<WriteDouble,      1>;
def : HWVALUWriteRes<WriteDoubleAdd,   1>;
def : HWVALUWriteRes<WriteDoubleCvt,   1>;
def : HWVALUWriteRes<WriteTrans64,     4>;
def : HWVALUWriteRes<WriteIntMul,      1>;
def : HWVALUWriteRes<Write64Bit,       1>;
} // End RetireOOO = 1

def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI,   MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
def : InstRW<[Write8PassMAI,   MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>;
def : InstRW<[Write16PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_32X32X")>;
def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;

} // End SchedModel = SIDPFullSpeedModel

//===----------------------------------------------------------------------===//
// SIDPGFX940FullSpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = SIDPGFX940FullSpeedModel in {

defm : SICommonWriteRes;

// NOTE(review): unlike the three models above, these write resources are not
// wrapped in `let RetireOOO = 1` - confirm whether that is intentional.
def : HWVALUWriteRes<WriteFloatFMA,    1>;
def : HWVALUWriteRes<WriteDouble,      1>;
def : HWVALUWriteRes<WriteDoubleAdd,   1>;
def : HWVALUWriteRes<WriteDoubleCvt,   1>;
def : HWVALUWriteRes<WriteTrans64,     4>;
def : HWVALUWriteRes<WriteIntMul,      1>;
def : HWVALUWriteRes<Write64Bit,       1>;

def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;

// 16x16 MFMA variants.
def : InstRW<[Write4PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>;
def : InstRW<[Write4PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
def : InstRW<[Write4PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>;
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;

// 32x32 MFMA variants.
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>;
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;

// 64-bit (DGEMM) MFMA variants.
def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;

// SMFMAC variants.
def : InstRW<[Write4PassMAI,  MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>;
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;

} // End SchedModel = SIDPGFX940FullSpeedModel

//===----------------------------------------------------------------------===//
// GFX10SpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = GFX10SpeedModel in {

// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<Write32Bit,         [HWVALU, HWRC],              5>;
def : HWWriteRes<WriteFloatCvt,      [HWVALU, HWRC],              5>;
def : HWWriteRes<Write64Bit,         [HWVALU, HWRC],              6>;
def : HWWriteRes<WriteTrans32,       [HWTransVALU, HWRC],         10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC],              8>;
def : HWWriteRes<WriteFloatFMA,      [HWVALU, HWRC],              5>;
def : HWWriteRes<WriteDouble,        [HWVALU, HWRC],              22>;
def : HWWriteRes<WriteDoubleAdd,     [HWVALU, HWRC],              22>;
def : HWWriteRes<WriteDoubleCvt,     [HWVALU, HWRC],              22>;
def : HWWriteRes<WriteIntMul,        [HWVALU, HWRC],              8>;
def : HWWriteRes<WriteTrans64,       [HWVALU, HWTransVALU, HWRC], 24>;

def : HWWriteRes<WriteBranch,        [HWBranch],       32>;
def : HWWriteRes<WriteExport,        [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS,           [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteSALU,          [HWSALU,   HWRC], 2>;
def : HWWriteRes<WriteSMEM,          [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteVMEM,          [HWVMEM,   HWRC], 320>;
def : HWWriteRes<WriteBarrier,       [HWBranch],       2000>;

def : UnsupportedWriteRes<WriteSFPU>;
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX10SpeedModel

//===----------------------------------------------------------------------===//
// GFX11SpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = GFX11SpeedModel in {

// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<Write32Bit,         [HWVALU, HWRC],              5>;
def : HWWriteRes<WriteFloatCvt,      [HWVALU, HWRC],              5>;
def : HWWriteRes<Write64Bit,         [HWVALU, HWRC],              6>;
def : HWWriteRes<WriteTrans32,       [HWTransVALU, HWRC],         10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC],              8>;
def : HWWriteRes<WriteFloatFMA,      [HWVALU, HWRC],              5>;
def : HWWriteRes<WriteDouble,        [HWVALU, HWRC],              38>;
def : HWWriteRes<WriteDoubleAdd,     [HWVALU, HWRC],              38>;
def : HWWriteRes<WriteDoubleCvt,     [HWVALU, HWRC],              38>;
def : HWWriteRes<WriteIntMul,        [HWVALU, HWRC],              8>;
def : HWWriteRes<WriteTrans64,       [HWVALU, HWTransVALU, HWRC], 40>;

def : HWWriteRes<WriteBranch,        [HWBranch],       32>;
def : HWWriteRes<WriteExport,        [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS,           [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteSALU,          [HWSALU,   HWRC], 2>;
def : HWWriteRes<WriteSFPU,          [HWSALU,   HWRC], 4>;
def : HWWriteRes<WriteSMEM,          [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteVMEM,          [HWVMEM,   HWRC], 320>;
def : HWWriteRes<WriteBarrier,       [HWBranch],       2000>;
} // End RetireOOO = 1

def : UnsupportedWriteRes<WritePseudoScalarTrans>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX11SpeedModel

//===----------------------------------------------------------------------===//
// GFX12SpeedModel
//===----------------------------------------------------------------------===//

let SchedModel = GFX12SpeedModel in {

// NOTE(review): unlike GFX10/GFX11 above, this model does not set RetireOOO
// and maps WriteTrans32/WriteTrans64 to HWVALU rather than HWTransVALU -
// confirm whether both differences are intentional.
def : HWWriteRes<Write32Bit,             [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt,          [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit,             [HWVALU, HWRC], 6>;
def : HWWriteRes<WriteTrans32,           [HWVALU, HWRC], 10>;
def : HWWriteRes<WriteQuarterRate32,     [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA,          [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble,            [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleAdd,         [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleCvt,         [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteIntMul,            [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteTrans64,           [HWVALU, HWRC], 40>;
def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;

def : HWWriteRes<WriteBranch,            [HWBranch],       32>;
def : HWWriteRes<WriteExport,            [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS,               [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteSALU,              [HWSALU,   HWRC], 2>;
def : HWWriteRes<WriteSFPU,              [HWSALU,   HWRC], 4>;
def : HWWriteRes<WriteSMEM,              [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteVMEM,              [HWVMEM,   HWRC], 320>;
def : HWWriteRes<WriteBarrier,           [HWBranch],       2000>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX12SpeedModel