//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// MachineModel definitions for Southern Islands (SI)
//
//===----------------------------------------------------------------------===//

// Prolog shared by the scheduling predicates in this file: it binds the
// target's SIInstrInfo to `TII`. The (void) cast keeps `TII` from being
// flagged as unused when a predicate does not reference it.
def : PredicateProlog<[{
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo());
  (void)TII;
}]>;

//===----------------------------------------------------------------------===//
// SchedWrite / SchedRead types
//===----------------------------------------------------------------------===//

def WriteBranch  : SchedWrite;
def WriteExport  : SchedWrite;
def WriteLDS     : SchedWrite;
def WriteSALU    : SchedWrite;
def WriteSMEM    : SchedWrite;
def WriteVMEM    : SchedWrite;
def WriteBarrier : SchedWrite;

def MIVGPRRead : SchedRead;
def MIMFMARead : SchedRead;

// Ordinary 16-bit or 32-bit VALU instructions.
def Write32Bit : SchedWrite;

// Conversions to or from F32, excluding conversions between F64 and F32.
def WriteFloatCvt : SchedWrite;

// Transcendental F16 or F32 instructions; these are quarter rate.
def WriteTrans32 : SchedWrite;

// The remaining quarter rate VALU instructions.
def WriteQuarterRate32 : SchedWrite;

def WriteFloatFMA : SchedWrite;

// Slow, quarter rate f64 instruction.
def WriteDouble : SchedWrite;

// Half rate f64 instruction (same rate as v_add_f64).
def WriteDoubleAdd : SchedWrite;

// Instruction converting to or from f64.
def WriteDoubleCvt : SchedWrite;

// F64 "transcendental" instructions (in practice only reciprocal and/or
// square root).
def WriteTrans64 : SchedWrite;

// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;

// MAI multipass instructions.
def Write2PassMAI  : SchedWrite;
def Write8PassMAI  : SchedWrite;
def Write16PassMAI : SchedWrite;

// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)

// Common base for the machine models defined below (SIFullSpeedModel,
// SIQuarterSpeedModel and GFX10SpeedModel).
class SISchedMachineModel : SchedMachineModel {
  let CompleteModel = 1;
  // MicroOpBufferSize = 1 means that instructions will always be added
  // to the ready queue when they become available. This exposes them
  // to the register pressure analysis.
  let MicroOpBufferSize = 1;
  let IssueWidth = 1;
  let PostRAScheduler = 1;

  // FIXME: Approximate 2 * branch cost. Try to hack around bad
  // early-ifcvt heuristics. These need improvement to avoid the OOE
  // heuristics.
  // Overridden with `let`, like the fields above, rather than by
  // re-declaring the inherited field.
  let MispredictPenalty = 20;
}

def SIFullSpeedModel    : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def GFX10SpeedModel     : SISchedMachineModel;

// Processor resources. Several buffer sizes are taken from the S_WAITCNT
// counters, as noted on each definition.
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
  let BufferSize = 1;
}
def HWExport : ProcResource<1> {
  let BufferSize = 7;  // Taken from S_WAITCNT
}
def HWLGKM : ProcResource<1> {
  let BufferSize = 31; // Taken from S_WAITCNT
}
def HWSALU : ProcResource<1> {
  let BufferSize = 1;
}
def HWVMEM : ProcResource<1> {
  let BufferSize = 15; // Taken from S_WAITCNT
}
def HWVALU : ProcResource<1> {
  let BufferSize = 1;
}
def HWRC : ProcResource<1> { // Register destination cache
  let BufferSize = 1;
}
def HWXDL : ProcResource<1> { // MFMA CU
  let BufferSize = 0;
}

// WriteRes whose Latency is supplied as a template argument.
//   write     - the SchedWrite being described
//   resources - processor resources consumed by the write
//   latency   - value assigned to Latency
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
                 int latency> : WriteRes<write, resources> {
  let Latency = latency;
}

// HWWriteRes specialised to the single resource HWVALU.
class HWVALUWriteRes<SchedWrite write, int latency> :
  HWWriteRes<write, [HWVALU], latency>;

def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;

// Selects MIVGPRRead when PredMIReadVGPR holds and ReadDefault otherwise.
def MIReadVGPR : SchedReadVariant<[
      SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
      SchedVar<NoSchedPred, [ReadDefault]>]>;

// The latency numbers are taken from AMD Accelerated Parallel Processing
// guide. They may not be accurate.

// Write resources and read advances shared by SIFullSpeedModel and
// SIQuarterSpeedModel; each of them instantiates this with `defm`.
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {

  def : HWWriteRes<WriteBranch,  [HWBranch], 8>;
  def : HWWriteRes<WriteExport,  [HWExport], 4>;
  def : HWWriteRes<WriteLDS,     [HWLGKM],   5>; // Can be between 2 and 64
  def : HWWriteRes<WriteSALU,    [HWSALU],   1>;
  def : HWWriteRes<WriteSMEM,    [HWLGKM],   5>;
  def : HWWriteRes<WriteVMEM,    [HWVMEM],   80>;
  def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???

  def : HWVALUWriteRes<Write32Bit,         1>;
  def : HWVALUWriteRes<Write64Bit,         2>;
  def : HWVALUWriteRes<WriteFloatCvt,      4>;
  def : HWVALUWriteRes<WriteTrans32,       4>;
  def : HWVALUWriteRes<WriteQuarterRate32, 4>;

  // For the MAI writes, ResourceCycles on HWXDL is set to the same value as
  // the latency.
  let ResourceCycles = [2] in
  def : HWWriteRes<Write2PassMAI,  [HWXDL], 2>;
  let ResourceCycles = [8] in
  def : HWWriteRes<Write8PassMAI,  [HWXDL], 8>;
  let ResourceCycles = [16] in
  def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;

  def : ReadAdvance<MIVGPRRead, -2>;
  def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;

  // Technically mfma reads can be from 0 to 4 cycles but that does not make
  // sense to model because its register setup is huge. In particular if we
  // properly model read advance as -2 for a vgpr read it will result in a
  // bad scheduling of acc writes before that mfma. To avoid it we would
  // need to consume 2 or 4 more vgprs to be initialized before the acc
  // write sequence. Just assume worst case here.
  def : ReadAdvance<MIMFMARead, -4>;

  def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
  def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
  def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
}

// COPY scheduling: a VGPR copy whose operand 0 size is <= 32 uses Write32Bit,
// a larger VGPR copy uses Write64Bit, and every other copy uses WriteSALU.
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>;
def WriteCopy : SchedWriteVariant<[
    SchedVar<PredIsVGPR32Copy, [Write32Bit]>,
    SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
    SchedVar<NoSchedPred, [WriteSALU]>]>;

let SchedModel = SIFullSpeedModel in {

defm : SICommonWriteRes;

def : HWVALUWriteRes<WriteFloatFMA,  1>;
def : HWVALUWriteRes<WriteDouble,    4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64,   4>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = SIFullSpeedModel

let SchedModel = SIQuarterSpeedModel in {

defm : SICommonWriteRes;

def : HWVALUWriteRes<WriteFloatFMA,  16>;
def : HWVALUWriteRes<WriteDouble,    16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64,   16>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = SIQuarterSpeedModel

let SchedModel = GFX10SpeedModel in {

// This model does not instantiate SICommonWriteRes; every write resource is
// listed explicitly.
// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
def : HWWriteRes<Write32Bit,         [HWVALU, HWRC],   5>;
def : HWWriteRes<WriteFloatCvt,      [HWVALU, HWRC],   5>;
def : HWWriteRes<Write64Bit,         [HWVALU, HWRC],   6>;
def : HWWriteRes<WriteTrans32,       [HWVALU, HWRC],   10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC],   8>;
def : HWWriteRes<WriteFloatFMA,      [HWVALU, HWRC],   5>;
def : HWWriteRes<WriteDouble,        [HWVALU, HWRC],   22>;
def : HWWriteRes<WriteDoubleAdd,     [HWVALU, HWRC],   22>;
def : HWWriteRes<WriteDoubleCvt,     [HWVALU, HWRC],   22>;
def : HWWriteRes<WriteTrans64,       [HWVALU, HWRC],   24>;

def : HWWriteRes<WriteBranch,        [HWBranch],       32>;
def : HWWriteRes<WriteExport,        [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS,           [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteSALU,          [HWSALU,   HWRC], 2>;
def : HWWriteRes<WriteSMEM,          [HWLGKM,   HWRC], 20>;
def : HWWriteRes<WriteVMEM,          [HWVMEM,   HWRC], 320>;
def : HWWriteRes<WriteBarrier,       [HWBranch],       2000>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX10SpeedModel