//===-- PPCScheduleP9.td - PPC P9 Scheduling Definitions ---*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine scheduling model for the POWER9 processor.
//
//===----------------------------------------------------------------------===//
include "PPCInstrInfo.td"

def P9Model : SchedMachineModel {
  // The maximum number of instructions to be issued at the same time.
  // Eight instructions can be fetched from the instruction cache at once, so
  // a value of 8 is technically correct, even though only 6 instructions may
  // actually be dispatched at a time.
  let IssueWidth = 8;

  // Load latency is 4 or 5 cycles depending on the load, assuming a cache hit;
  // a cache miss takes longer. Two instructions (lxvl, lxvll) have a latency
  // of 6 cycles, but it is not worth bumping this value up to 6 when the vast
  // majority of loads take 4 or 5 cycles.
  let LoadLatency = 5;

  // A total of 16 cycles to recover from a branch mispredict.
  let MispredictPenalty = 16;

  // Try to make sure we have at least 10 dispatch groups in a loop.
  // A dispatch group is 6 instructions (10 * 6 = 60).
  let LoopMicroOpBufferSize = 60;

  // As iops are dispatched to a slice, they are held in an independent slice
  // issue queue until all register sources and other dependencies have been
  // resolved and they can be issued. Each of the four execution slices has an
  // 11-entry iop issue queue (4 * 11 = 44).
  let MicroOpBufferSize = 44;

  let CompleteModel = 1;

  // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing
  // Engine), prefixed instructions on Power 9, PC relative mem ops, or
  // instructions introduced in ISA 3.1.
  let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops,
                             IsISA3_1];
}

let SchedModel = P9Model in {

// ***************** Processor Resources *****************

// Dispatcher slots:
// x0, x1, x2, and x3 are the dedicated slice dispatch ports, where each
// corresponds to one of the four execution slices.
def DISPx02 : ProcResource<2>;
def DISPx13 : ProcResource<2>;
// The xa and xb ports can be used to send an iop to either of the two slices
// of the superslice, but are restricted to iops with only two primary sources.
def DISPxab : ProcResource<2>;
// b0 and b1 are dedicated dispatch ports into the branch slice.
def DISPb01 : ProcResource<2>;

// Any non BR dispatch ports
def DISP_NBR : ProcResGroup<[DISPx02, DISPx13, DISPxab]>;
def DISP_SS : ProcResGroup<[DISPx02, DISPx13]>;

// Issue Ports
// An instruction can go down one of two issue queues.
// Address Generation (AGEN) mainly for loads and stores.
// Execution (EXEC) for most other instructions.
// Some instructions cannot be run on just any issue queue and may require an
// Even or an Odd queue. The EXECE represents the even queues and the EXECO
// represents the odd queues.
def IP_AGEN : ProcResource<4>;
def IP_EXEC : ProcResource<4>;
let Super = IP_EXEC in {
  def IP_EXECE : ProcResource<2>; // Even Exec Ports
  def IP_EXECO : ProcResource<2>; // Odd Exec Ports
}

// Pipeline Groups
// Four ALU (Fixed Point Arithmetic) units in total. Two even, two Odd.
def ALU : ProcResource<4>;
let Super = ALU in {
  def ALUE : ProcResource<2>; // Even ALU pipelines
  def ALUO : ProcResource<2>; // Odd ALU pipelines
}

// Two DIV (Fixed Point Divide) units.
def DIV : ProcResource<2>;

// Four DP (Floating Point) units in total. Two even, two Odd.
def DP : ProcResource<4>;
let Super = DP in {
  def DPE : ProcResource<2>; // Even DP pipelines
  def DPO : ProcResource<2>; // Odd DP pipelines
}

// Four LS (Load or Store) units.
def LS : ProcResource<4>;

// Two PM (Permute) units.
def PM : ProcResource<2>;

// Only one DFU (Decimal Floating Point and Quad Precision) unit.
def DFU : ProcResource<1>;

// Only one Branch unit.
let BufferSize = 16 in
def BR : ProcResource<1>;

// Only one CY (Crypto) unit.
def CY : ProcResource<1>;

// ***************** SchedWriteRes Definitions *****************

// Dispatcher
// Dispatch Rules: '-' or 'V'
// Vector ('V') - vector iops (128-bit operand) take only one decode and
// dispatch slot but are dispatched to both the even and odd slices of a
// superslice.
let NumMicroOps = 0, Latency = 1 in
def DISP_1C : SchedWriteRes<[DISP_NBR]>;

// Dispatch Rules: 'E'
// Even slice ('E') - certain operations must be sent only to an even slice.
// Also consumes the odd dispatch slice slot of the same superslice at
// dispatch.
let NumMicroOps = 0, Latency = 1 in
def DISP_EVEN_1C : SchedWriteRes<[DISPx02, DISPx13]>;

// Dispatch Rules: 'P'
// Paired ('P') - certain cracked and expanded iops are paired such that they
// must dispatch together to the same superslice.
let NumMicroOps = 0, Latency = 1 in
def DISP_PAIR_1C : SchedWriteRes<[DISP_SS, DISP_SS]>;

// Dispatch Rules: 'R'
// Tuple Restricted ('R') - certain iops preclude dispatching more than one
// operation per slice for the superslice to which they are dispatched.
let NumMicroOps = 0, Latency = 1 in
def DISP_3SLOTS_1C : SchedWriteRes<[DISPx02, DISPx13, DISPxab]>;

// Each execution and branch slice can receive up to two iops per cycle.
// NOTE(review): this write consumes DISPxab, although DISPb01 is the resource
// described as the dedicated branch-slice dispatch ports - confirm that using
// DISPxab here is intentional.
let NumMicroOps = 0, Latency = 1 in
def DISP_BR_1C : SchedWriteRes<[DISPxab]>;

// Issue Ports
// Each write occupies its issue port resource, has a latency of one cycle,
// and contributes no micro-ops.
let NumMicroOps = 0, Latency = 1 in {
  def IP_AGEN_1C : SchedWriteRes<[IP_AGEN]>;
  def IP_EXEC_1C : SchedWriteRes<[IP_EXEC]>;
  def IP_EXECE_1C : SchedWriteRes<[IP_EXECE]>;
  def IP_EXECO_1C : SchedWriteRes<[IP_EXECO]>;
}

// Pipeline Groups

// ALU Units
// An ALU may take either 2 or 3 cycles to complete the operation.
// However, the ALU unit is only ever busy for 1 cycle at a time and may
// receive new instructions each cycle.
let Latency = 2 in {
  def P9_ALU_2C : SchedWriteRes<[ALU]>;
  def P9_ALUE_2C : SchedWriteRes<[ALUE]>;
  def P9_ALUO_2C : SchedWriteRes<[ALUO]>;
}

let Latency = 3 in {
  def P9_ALU_3C : SchedWriteRes<[ALU]>;
  def P9_ALUE_3C : SchedWriteRes<[ALUE]>;
  def P9_ALUO_3C : SchedWriteRes<[ALUO]>;
}

// DIV Unit
// A DIV unit may take from 5 to 40 cycles to complete.
// Some DIV operations may keep the unit busy for up to 8 cycles.
// Naming convention for the writes below: P9_<unit>_<latency>C[_<busy>],
// where <busy> is the number of cycles the unit stays occupied
// (ResourceCycles) when that differs from the default.
def P9_DIV_5C : SchedWriteRes<[DIV]> { let Latency = 5; }
def P9_DIV_12C : SchedWriteRes<[DIV]> { let Latency = 12; }

let ResourceCycles = [8] in {
  def P9_DIV_16C_8 : SchedWriteRes<[DIV]> { let Latency = 16; }
  def P9_DIV_24C_8 : SchedWriteRes<[DIV]> { let Latency = 24; }
  def P9_DIV_40C_8 : SchedWriteRes<[DIV]> { let Latency = 40; }
}

// DP Unit
// A DP unit may take from 2 to 36 cycles to complete.
// Some DP operations keep the unit busy for up to 10 cycles.
// DP writes may use any DP pipeline; DPE / DPO writes are restricted to the
// even / odd DP pipelines respectively.
let Latency = 5 in
def P9_DP_5C : SchedWriteRes<[DP]>;

let Latency = 7 in {
  def P9_DP_7C : SchedWriteRes<[DP]>;
  def P9_DPE_7C : SchedWriteRes<[DPE]>;
  def P9_DPO_7C : SchedWriteRes<[DPO]>;
}

let Latency = 22, ResourceCycles = [5] in
def P9_DP_22C_5 : SchedWriteRes<[DP]>;

let Latency = 24, ResourceCycles = [8] in {
  def P9_DPO_24C_8 : SchedWriteRes<[DPO]>;
  def P9_DPE_24C_8 : SchedWriteRes<[DPE]>;
}

// Latency is 26 as the name states (it was previously 22, duplicating
// P9_DP_22C_5). P9_DPOpAndALU2Op_29C_5 chains this write with P9_ALU_3C and
// relies on 26 + 3 = 29.
let Latency = 26, ResourceCycles = [5] in
def P9_DP_26C_5 : SchedWriteRes<[DP]>;

// The even/odd variants consume DPE/DPO (not the generic DP resource), in
// line with the 7C, 24C and 33C even/odd writes.
let Latency = 27, ResourceCycles = [10] in {
  def P9_DPE_27C_10 : SchedWriteRes<[DPE]>;
  def P9_DPO_27C_10 : SchedWriteRes<[DPO]>;
}

let Latency = 33, ResourceCycles = [8] in {
  def P9_DP_33C_8 : SchedWriteRes<[DP]>;
  def P9_DPE_33C_8 : SchedWriteRes<[DPE]>;
  def P9_DPO_33C_8 : SchedWriteRes<[DPO]>;
}

// As above, the even/odd variants consume DPE/DPO rather than DP.
let Latency = 36, ResourceCycles = [10] in {
  def P9_DP_36C_10 : SchedWriteRes<[DP]>;
  def P9_DPE_36C_10 : SchedWriteRes<[DPE]>;
  def P9_DPO_36C_10 : SchedWriteRes<[DPO]>;
}

// PM Unit
// Three cycle permute operations.
def P9_PM_3C : SchedWriteRes<[PM]> { let Latency = 3; }

// Load and Store Units
// Loads can have 4, 5 or 6 cycles of latency.
// Stores are listed as having a single cycle of latency. This is not
// completely accurate since it takes more than 1 cycle to actually store
// the value. However, since the store does not produce a result it can be
// considered complete after one cycle.
def P9_LS_1C : SchedWriteRes<[LS]> { let Latency = 1; }
def P9_LS_4C : SchedWriteRes<[LS]> { let Latency = 4; }
def P9_LS_5C : SchedWriteRes<[LS]> { let Latency = 5; }
def P9_LS_6C : SchedWriteRes<[LS]> { let Latency = 6; }

// DFU Unit
// Some of the most expensive ops use the DFU.
// Can take from 12 cycles to 76 cycles to obtain a result.
// The unit may be busy for up to 62 cycles.
let Latency = 12 in
def P9_DFU_12C : SchedWriteRes<[DFU]>;

let Latency = 23, ResourceCycles = [11] in
def P9_DFU_23C : SchedWriteRes<[DFU]>;

let Latency = 24, ResourceCycles = [12] in
def P9_DFU_24C : SchedWriteRes<[DFU]>;

let Latency = 37, ResourceCycles = [25] in
def P9_DFU_37C : SchedWriteRes<[DFU]>;

let Latency = 58, ResourceCycles = [44] in
def P9_DFU_58C : SchedWriteRes<[DFU]>;

let Latency = 76, ResourceCycles = [62] in
def P9_DFU_76C : SchedWriteRes<[DFU]>;

// 2 or 5 cycle latencies for the branch unit.
let Latency = 2 in
def P9_BR_2C : SchedWriteRes<[BR]>;

let Latency = 5 in
def P9_BR_5C : SchedWriteRes<[BR]>;

// 6 cycle latency for the crypto unit
let Latency = 6 in
def P9_CY_6C : SchedWriteRes<[CY]>;

// ***************** WriteSeq Definitions *****************

// These are combinations of the writes defined above. Some cracked
// instructions cannot execute their pieces in parallel, so the latencies of
// the individual writes must be added; the cycle count in each name is that
// sum.

// Load followed by ALU, permute or a second load.
def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
def P9_LoadAndALU2Op_7C : WriteSequence<[P9_LS_4C, P9_ALU_3C]>;
def P9_LoadAndALU2Op_8C : WriteSequence<[P9_LS_5C, P9_ALU_3C]>;
def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>;

// Integer divide followed by ALU.
def P9_IntDivAndALUOp_18C_8 : WriteSequence<[P9_DIV_16C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;

// Store followed by ALU, and ALU-only chains.
def P9_StoreAndALUOp_3C : WriteSequence<[P9_LS_1C, P9_ALU_2C]>;
def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
def P9_ALU2OpAndALU2Op_6C : WriteSequence<[P9_ALU_3C, P9_ALU_3C]>;
def P9_ALUOpAndALUOpAndALUOp_6C
    : WriteSequence<[P9_ALU_2C, P9_ALU_2C, P9_ALU_2C]>;

// DP followed by ALU.
def P9_DPOpAndALUOp_7C : WriteSequence<[P9_DP_5C, P9_ALU_2C]>;
def P9_DPOpAndALU2Op_10C : WriteSequence<[P9_DP_7C, P9_ALU_3C]>;
def P9_DPOpAndALU2Op_25C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_3C]>;
def P9_DPOpAndALU2Op_29C_5 : WriteSequence<[P9_DP_26C_5, P9_ALU_3C]>;
def P9_DPOpAndALU2Op_36C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_3C]>;
def P9_DPOpAndALU2Op_39C_10 : WriteSequence<[P9_DP_36C_10, P9_ALU_3C]>;

// Branch followed by ALU.
def P9_BROpAndALUOp_7C : WriteSequence<[P9_BR_5C, P9_ALU_2C]>;

// Include the resource requirements of individual instructions.
include "P9InstrResources.td"

} // SchedModel = P9Model
