//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for the Ampere Computing Ampere-1B to
// support instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// The Ampere-1B core is an out-of-order micro-architecture.  The front
// end has branch prediction, with a 10-cycle recovery time from a
// mispredicted branch.  Instructions coming out of the front end are
// decoded into internal micro-ops (uops).

// Top-level machine model: global sizing parameters used by the scheduler,
// plus the list of subtarget features this model does not describe.
def Ampere1BModel : SchedMachineModel {
  // Every instruction is expected to have scheduling information.
  let CompleteModel         = 1;

  // Front-end / dispatch parameters.
  let IssueWidth            = 12;   // Peak micro-op dispatch rate.
  let LoopMicroOpBufferSize = 32;   // Size of the instruction queue.
  let MicroOpBufferSize     = 192;  // Size of the micro-op re-order buffer.

  // Latency / penalty parameters (in cycles).
  let LoadLatency           = 3;    // Optimistic load latency.
  let MispredictPenalty     = 10;   // Branch mispredict recovery.

  // Features that are not modelled here: SVE, SME and PA.
  list<Predicate> UnsupportedFeatures =
      !listconcat(SVEUnsupported.F, SMEUnsupported.F, PAUnsupported.F);
}

let SchedModel = Ampere1BModel in {

//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available on Ampere-1B.

def Ampere1BUnitA  : ProcResource<2>; // integer single-cycle, branch, and flags r/w
def Ampere1BUnitB  : ProcResource<2>; // integer single-cycle, and complex shifts
def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle
def Ampere1BUnitL  : ProcResource<2>; // load
def Ampere1BUnitS  : ProcResource<2>; // store address calculation
def Ampere1BUnitX  : ProcResource<1>; // FP and vector operations, and flag write
def Ampere1BUnitY  : ProcResource<1>; // FP and vector operations, and crypto
def Ampere1BUnitZ  : ProcResource<1>; // FP store data and FP-to-integer moves

def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>;
def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>;

//===----------------------------------------------------------------------===//
// Define customized scheduler read/write types specific to the Ampere-1B.

// Helper: a SchedWriteRes with an explicit latency and micro-op count.
//   resources - processor resources consumed by the write
//   latency   - value assigned to the Latency field
//   uops      - value assigned to the NumMicroOps field
//
// The defs below follow the naming scheme
//   Ampere1BWrite_<latency>cyc_<count><unit>[_<count><unit>...]
class Ampere1BWriteRes<list<ProcResourceKind> resources, int latency, int uops>
    : SchedWriteRes<resources> {
  let Latency = latency;
  let NumMicroOps = uops;
}

// -- 1 cycle
def Ampere1BWrite_1cyc_1A : Ampere1BWriteRes<[Ampere1BUnitA], 1, 1>;
def Ampere1BWrite_1cyc_2A :
    Ampere1BWriteRes<[Ampere1BUnitA, Ampere1BUnitA], 1, 2>;
def Ampere1BWrite_1cyc_1B : Ampere1BWriteRes<[Ampere1BUnitB], 1, 1>;
def Ampere1BWrite_1cyc_1BS : Ampere1BWriteRes<[Ampere1BUnitBS], 1, 1>;
def Ampere1BWrite_1cyc_1BS_1B :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitB], 1, 2>;
def Ampere1BWrite_1cyc_1AB : Ampere1BWriteRes<[Ampere1BUnitAB], 1, 1>;
def Ampere1BWrite_1cyc_1AB_1A :
    Ampere1BWriteRes<[Ampere1BUnitAB, Ampere1BUnitA], 1, 2>;
def Ampere1BWrite_1cyc_1L : Ampere1BWriteRes<[Ampere1BUnitL], 1, 1>;
def Ampere1BWrite_1cyc_1S : Ampere1BWriteRes<[Ampere1BUnitS], 1, 1>;
def Ampere1BWrite_1cyc_2S :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitS], 1, 2>;

// -- 2 cycles
def Ampere1BWrite_2cyc_1Y : Ampere1BWriteRes<[Ampere1BUnitY], 2, 1>;
def Ampere1BWrite_2cyc_2AB :
    Ampere1BWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB], 2, 2>;
def Ampere1BWrite_2cyc_1B_1AB :
    Ampere1BWriteRes<[Ampere1BUnitB, Ampere1BUnitAB], 2, 2>;
def Ampere1BWrite_2cyc_1B_1S :
    Ampere1BWriteRes<[Ampere1BUnitB, Ampere1BUnitS], 2, 2>;
def Ampere1BWrite_2cyc_1B_1S_1AB :
    Ampere1BWriteRes<[Ampere1BUnitB, Ampere1BUnitS, Ampere1BUnitAB], 2, 3>;
def Ampere1BWrite_2cyc_1S_2Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitZ, Ampere1BUnitZ], 2, 3>;
def Ampere1BWrite_2cyc_1XY : Ampere1BWriteRes<[Ampere1BUnitXY], 2, 1>;
def Ampere1BWrite_2cyc_1S_1Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitZ], 2, 2>;

// -- 3 cycles
def Ampere1BWrite_3cyc_1BS : Ampere1BWriteRes<[Ampere1BUnitBS], 3, 1>;
def Ampere1BWrite_3cyc_1L : Ampere1BWriteRes<[Ampere1BUnitL], 3, 1>;
def Ampere1BWrite_3cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 3, 1>;
def Ampere1BWrite_3cyc_1XY : Ampere1BWriteRes<[Ampere1BUnitXY], 3, 1>;
def Ampere1BWrite_3cyc_1Z : Ampere1BWriteRes<[Ampere1BUnitZ], 3, 1>;
def Ampere1BWrite_3cyc_1S_1Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitZ], 3, 2>;
def Ampere1BWrite_3cyc_1S_2Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitZ, Ampere1BUnitZ], 3, 3>;
def Ampere1BWrite_3cyc_2S_2Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ], 3, 4>;

// -- 4 and 5 cycles
def Ampere1BWrite_4cyc_1BS_1AB :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB], 4, 2>;
def Ampere1BWrite_4cyc_1L : Ampere1BWriteRes<[Ampere1BUnitL], 4, 1>;
def Ampere1BWrite_4cyc_2L :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL], 4, 2>;
def Ampere1BWrite_4cyc_1L_1B :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitB], 4, 2>;
def Ampere1BWrite_4cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 4, 1>;
def Ampere1BWrite_4cyc_1XY : Ampere1BWriteRes<[Ampere1BUnitXY], 4, 1>;
def Ampere1BWrite_4cyc_2XY :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY], 4, 2>;
def Ampere1BWrite_5cyc_1BS : Ampere1BWriteRes<[Ampere1BUnitBS], 5, 1>;
def Ampere1BWrite_4cyc_1XY_1S_1Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitS, Ampere1BUnitZ], 4, 3>;
def Ampere1BWrite_4cyc_3S_3Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ], 4, 6>;
def Ampere1BWrite_5cyc_4S_4Z :
    Ampere1BWriteRes<[Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ,
                      Ampere1BUnitZ, Ampere1BUnitZ], 5, 8>;
def Ampere1BWrite_5cyc_1L_1BS :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitBS], 5, 2>;
def Ampere1BWrite_5cyc_3L :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL], 5, 3>;
def Ampere1BWrite_5cyc_4L :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitL, Ampere1BUnitL], 5, 4>;
def Ampere1BWrite_5cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 5, 1>;
def Ampere1BWrite_5cyc_2XY_2S_2Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ], 5, 6>;

// -- 6 cycles
def Ampere1BWrite_6cyc_1BS_1A :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitA], 6, 2>;
def Ampere1BWrite_6cyc_1BS_2A :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, Ampere1BUnitA], 6, 3>;
def Ampere1BWrite_6cyc_1L_1XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitXY], 6, 2>;
def Ampere1BWrite_6cyc_2L_2XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY], 6, 4>;
// NOTE(review): one resource is listed but NumMicroOps is 2, unlike the other
// single-unit writes in this file -- confirm which is intended.
def Ampere1BWrite_6cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 6, 2>;
def Ampere1BWrite_6cyc_2XY :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY], 6, 2>;
def Ampere1BWrite_6cyc_3XY :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY], 6, 3>;
def Ampere1BWrite_6cyc_2XY_2S_2Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ], 6, 6>;
def Ampere1BWrite_6cyc_3XY_3S_3Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ], 6, 9>;

// -- 7 cycles
def Ampere1BWrite_7cyc_1BS_1XY :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY], 7, 2>;
def Ampere1BWrite_7cyc_1XY_1Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ], 7, 2>;
def Ampere1BWrite_7cyc_1X_1Z :
    Ampere1BWriteRes<[Ampere1BUnitX, Ampere1BUnitZ], 7, 2>;
def Ampere1BWrite_7cyc_3L_3XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY], 7, 6>;
def Ampere1BWrite_7cyc_4L_4XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY], 7, 8>;
def Ampere1BWrite_7cyc_4XY_4S_4Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ,
                      Ampere1BUnitZ, Ampere1BUnitZ], 7, 12>;

// -- 8 cycles
def Ampere1BWrite_8cyc_1BS_1L :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitL], 8, 2>;
def Ampere1BWrite_8cyc_1BS_1XY :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY], 8, 2>;
def Ampere1BWrite_8cyc_2L_3XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY], 8, 5>;
def Ampere1BWrite_8cyc_3L_3XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY], 8, 6>;
def Ampere1BWrite_8cyc_4L_4XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY], 8, 8>;
def Ampere1BWrite_8cyc_2XY :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY], 8, 2>;
def Ampere1BWrite_8cyc_4XY :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY], 8, 4>;

// -- 9 cycles
def Ampere1BWrite_9cyc_6XY_4S_4Z :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitS, Ampere1BUnitS,
                      Ampere1BUnitZ, Ampere1BUnitZ,
                      Ampere1BUnitZ, Ampere1BUnitZ], 9, 14>;
def Ampere1BWrite_9cyc_1A_1BS_1X :
    Ampere1BWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX], 9, 3>;
def Ampere1BWrite_9cyc_1A_1BS_1XY :
    Ampere1BWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY], 9, 3>;
def Ampere1BWrite_9cyc_3L_3XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY], 9, 6>;
def Ampere1BWrite_9cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 9, 1>;
def Ampere1BWrite_9cyc_3XY :
    Ampere1BWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY], 9, 3>;

// -- 10 cycles and above
// NOTE(review): the two *_4L_8XY writes list only four XY resources while the
// name and NumMicroOps (12) imply eight -- confirm against the hardware
// documentation before changing.
def Ampere1BWrite_10cyc_4L_8XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY], 10, 12>;
def Ampere1BWrite_11cyc_1BS_2XY :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY], 11, 3>;
def Ampere1BWrite_11cyc_4L_8XY :
    Ampere1BWriteRes<[Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitL, Ampere1BUnitL,
                      Ampere1BUnitXY, Ampere1BUnitXY,
                      Ampere1BUnitXY, Ampere1BUnitXY], 11, 12>;
def Ampere1BWrite_12cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 12, 1>;
def Ampere1BWrite_13cyc_1BS_1X :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitX], 13, 2>;
def Ampere1BWrite_17cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 17, 1>;
// Latency is 19, matching the def's name and the 64-bit divide latency used
// by WriteID64.
def Ampere1BWrite_19cyc_2BS_1X :
    Ampere1BWriteRes<[Ampere1BUnitBS, Ampere1BUnitBS, Ampere1BUnitX], 19, 3>;
def Ampere1BWrite_19cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 19, 1>;
def Ampere1BWrite_21cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 21, 1>;
def Ampere1BWrite_33cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 33, 1>;
def Ampere1BWrite_39cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 39, 1>;
def Ampere1BWrite_63cyc_1X : Ampere1BWriteRes<[Ampere1BUnitX], 63, 1>;

// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
// which are a single uop, and for extended registers, which have full flexibility
// across Unit A or B for both uops.
// Arithmetic with a shifted or extended register operand. Variants are
// listed in order: extended-register form, cheap-LSL form, then the default.
def Ampere1BWrite_Arith : SchedWriteVariant<[
    SchedVar<RegExtendedPred, [Ampere1BWrite_2cyc_2AB]>,
    SchedVar<IsCheapLSL,      [Ampere1BWrite_1cyc_1AB]>,
    SchedVar<NoSchedPred,     [Ampere1BWrite_2cyc_1B_1AB]>
]>;

// Flag-setting counterpart; currently the same variants as
// Ampere1BWrite_Arith.
def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[
    SchedVar<RegExtendedPred, [Ampere1BWrite_2cyc_2AB]>,
    SchedVar<IsCheapLSL,      [Ampere1BWrite_1cyc_1AB]>,
    SchedVar<NoSchedPred,     [Ampere1BWrite_2cyc_1B_1AB]>
]>;

//===----------------------------------------------------------------------===//
// Map the target-defined scheduler read/write resources and latencies for
// Ampere-1B.  This provides a coarse model, which is then specialised below.

// MOVN, MOVZ
def : WriteRes<WriteImm, [Ampere1BUnitAB]>;
// ALU
def : WriteRes<WriteI, [Ampere1BUnitAB]>;
// ALU of Shifted-Reg
def : WriteRes<WriteISReg, [Ampere1BUnitB, Ampere1BUnitAB]> { let Latency = 2; let NumMicroOps = 2; }
// ALU of Extended-Reg
def : WriteRes<WriteIEReg, [Ampere1BUnitAB, Ampere1BUnitAB]> { let Latency = 2; let NumMicroOps = 2; }
// EXTR shifts a reg pair
def : WriteRes<WriteExtr, [Ampere1BUnitB]>;
// Shift/Scale
def : WriteRes<WriteIS, [Ampere1BUnitB]>;
// 32-bit Divide
def : WriteRes<WriteID32, [Ampere1BUnitBS, Ampere1BUnitX]> { let Latency = 13; }
// 64-bit Divide
def : WriteRes<WriteID64, [Ampere1BUnitBS, Ampere1BUnitX]> { let Latency = 19; }
// 32-bit Multiply
def : WriteRes<WriteIM32, [Ampere1BUnitBS]> { let Latency = 3; }
// 64-bit Multiply
def : WriteRes<WriteIM64, [Ampere1BUnitBS, Ampere1BUnitAB]> { let Latency = 3; }
// Branches
def : WriteRes<WriteBr, [Ampere1BUnitA]>;
def : WriteRes<WriteBrReg, [Ampere1BUnitA, Ampere1BUnitA]>;
// Load from base addr plus immediate offset
def : WriteRes<WriteLD, [Ampere1BUnitL]> { let Latency = 3; }
// Store to base addr plus immediate offset
def : WriteRes<WriteST, [Ampere1BUnitS]> { let Latency = 1; }
// Store a register pair.
// NOTE(review): two S resources are listed but NumMicroOps is 1 -- confirm.
def : WriteRes<WriteSTP, [Ampere1BUnitS, Ampere1BUnitS]> { let Latency = 1; let NumMicroOps = 1; }
// Address computation
def : WriteRes<WriteAdr, [Ampere1BUnitAB]>;
// Load from a register index (maybe scaled).
// NOTE(review): this load consumes the S unit rather than L, and lists two
// resources with NumMicroOps = 1 -- confirm.
def : WriteRes<WriteLDIdx, [Ampere1BUnitAB, Ampere1BUnitS]> { let Latency = 3; let NumMicroOps = 1; }
// Store to a register index (maybe scaled).
def : WriteRes<WriteSTIdx, [Ampere1BUnitS, Ampere1BUnitS]> { let Latency = 1; let NumMicroOps = 2; }
// General floating-point ops.
def : WriteRes<WriteF, [Ampere1BUnitXY]> { let Latency = 2; }
// Floating-point compare.
def : WriteRes<WriteFCmp, [Ampere1BUnitX]> { let Latency = 3; }
// Float conversion.
def : WriteRes<WriteFCvt, [Ampere1BUnitXY]> { let Latency = 3; }
// Float-int register copy (no overrides; WriteRes defaults apply).
def : WriteRes<WriteFCopy, [Ampere1BUnitXY]>;
// Floating-point immediate.
def : WriteRes<WriteFImm, [Ampere1BUnitXY]> { let Latency = 2; }
// Floating-point multiply.
def : WriteRes<WriteFMul, [Ampere1BUnitXY]> { let Latency = 4; }
// Floating-point division.
def : WriteRes<WriteFDiv, [Ampere1BUnitXY]> { let Latency = 19; }
// 64bit Vector D ops.
def : WriteRes<WriteVd, [Ampere1BUnitXY]> { let Latency = 3; }
// 128bit Vector Q ops.
def : WriteRes<WriteVq, [Ampere1BUnitXY]> { let Latency = 3; }
// Vector loads.
def : WriteRes<WriteVLD, [Ampere1BUnitL, Ampere1BUnitL]> { let Latency = 4; }
// Vector stores.
def : WriteRes<WriteVST, [Ampere1BUnitS, Ampere1BUnitZ]> { let Latency = 2; }

// Atomics are marked unsupported in this model.
def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }

// System, barrier and hint writes consume no modelled resource.
def : WriteRes<WriteSys, []> { let Latency = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }

// The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
def : WriteRes<WriteLDHi, []> { let Latency = 3; }

// Forwarding logic.
// A non-zero ReadAdvance is given only to ReadIMA, and only when the value is
// produced by WriteIM32 or WriteIM64; every other read below advances by 0.
def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM, 0>;
def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadST, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;

//===----------------------------------------------------------------------===//
// Specialising the scheduling model further for Ampere-1B.

def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>;

// Branch instructions
def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>;
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>;

// Common Short Sequence Compression (CSSC)
def : InstRW<[Ampere1BWrite_1cyc_1AB], (instregex "^ABS[WX]")>;
def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CNT[WX]")>;
def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "^CTZ[WX]")>;
def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instregex "^[SU](MAX|MIN)[WX]")>;

// Cryptography instructions
// -- AES encryption/decryption
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>;
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>;
// -- Polynomial multiplication
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
// -- SHA-256 hash
def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>;
// -- SHA-256 schedule update
def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>;
// -- SHA-3 instructions
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
// -- SHA-512 hash
def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>;
// -- SHA-512 schedule update
def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>;
// -- SHA1 choose/majority/parity
def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>;
// -- SHA1 hash/schedule update
def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>;
def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>;
// -- SM3 hash
def : InstRW<[Ampere1BWrite_2cyc_1XY],
        (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>;
// -- SM4
def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>;

// FP and vector load instructions
// -- Load 1-element structure to one/all lanes
// ---- all lanes
def : InstRW<[Ampere1BWrite_6cyc_1L_1XY],
        (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// ---- one lane
def : InstRW<[Ampere1BWrite_6cyc_1L_1XY],
        (instregex "^LD1i(8|16|32|64)")>;
// -- Load 1-element structure to one/all lanes, 1D size
def : InstRW<[Ampere1BWrite_4cyc_1L],
        (instregex "^LD1Rv1d")>;
// -- Load 1-element structures to 1 register
def : InstRW<[Ampere1BWrite_4cyc_1L],
        (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 1-element structures to 2 registers
def : InstRW<[Ampere1BWrite_4cyc_2L],
        (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 1-element structures to 3 registers
def : InstRW<[Ampere1BWrite_5cyc_3L],
        (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 1-element structures to 4 registers
def : InstRW<[Ampere1BWrite_5cyc_4L],
        (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 2-element structure to all lanes of 2 registers, 1D size
def : InstRW<[Ampere1BWrite_4cyc_2L],
        (instregex "^LD2Rv1d")>;
// -- Load 2-element structure to all lanes of 2 registers, other sizes
def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
        (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- Load 2-element structure to one lane of 2 registers
def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
        (instregex "^LD2i(8|16|32|64)")>;
// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
def : InstRW<[Ampere1BWrite_6cyc_2L_2XY],
        (instregex "^LD2Twov(16b|8h|4s|2d)")>;
// -- Load 2-element structures to 2 registers, 8B/4H/2S size
def : InstRW<[Ampere1BWrite_8cyc_2L_3XY],
        (instregex "^LD2Twov(8b|4h|2s)")>;
// -- Load 3-element structure to all lanes of 3 registers, 1D size
def : InstRW<[Ampere1BWrite_5cyc_3L],
        (instregex "^LD3Rv1d")>;
// -- Load 3-element structure to all lanes of 3 registers, other sizes
def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
        (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- Load 3-element structure to one lane of 3 registers
def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
        (instregex "^LD3i(8|16|32|64)")>;
// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
def : InstRW<[Ampere1BWrite_8cyc_3L_3XY],
        (instregex "^LD3Threev(16b|8h|4s)")>;
// -- Load 3-element structures to 3 registers, 2D size
def : InstRW<[Ampere1BWrite_7cyc_3L_3XY],
        (instregex "^LD3Threev2d")>;
// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
def : InstRW<[Ampere1BWrite_9cyc_3L_3XY],
        (instregex "^LD3Threev(8b|4h|2s)")>;
// -- Load 4-element structure to all lanes of 4 registers, 1D size
def : InstRW<[Ampere1BWrite_5cyc_4L],
        (instregex "^LD4Rv1d")>;
// -- Load 4-element structure to all lanes of 4 registers, other sizes
def : InstRW<[Ampere1BWrite_7cyc_4L_4XY],
        (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- Load 4-element structure to one lane of 4 registers
def : InstRW<[Ampere1BWrite_7cyc_4L_4XY],
        (instregex "^LD4i(8|16|32|64)")>;
// -- Load 4-element structures to 4 registers, 2D size
def : InstRW<[Ampere1BWrite_8cyc_4L_4XY],
        (instregex "^LD4Fourv2d")>;
// -- Load 4-element structures to 4 registers, 2S size
def : InstRW<[Ampere1BWrite_11cyc_4L_8XY],
        (instregex "^LD4Fourv2s")>;
// -- Load 4-element structures to 4 registers, other sizes
def : InstRW<[Ampere1BWrite_10cyc_4L_8XY],
        (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
// -- Load pair, Q-form
def : InstRW<[Ampere1BWrite_4cyc_2L], (instregex "LDN?PQ")>;
// -- Load pair, S/D-form
def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>;
// -- Load register
def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>;
// -- Load register, sign-extended register
def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>;

// FP and vector store instructions
// -- Store 1-element structure from one lane of 1 register
def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z],
        (instregex "^ST1i(8|16|32|64)")>;
// -- Store 1-element structures from 1 register
def : InstRW<[Ampere1BWrite_2cyc_1S_1Z],
        (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 1-element structures from 2 registers
def : InstRW<[Ampere1BWrite_3cyc_2S_2Z],
        (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 1-element structures from 3 registers
def : InstRW<[Ampere1BWrite_4cyc_3S_3Z],
        (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 1-element structures from 4 registers
def : InstRW<[Ampere1BWrite_5cyc_4S_4Z],
        (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 2-element structure from one lane of 2 registers
def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z],
        (instregex "^ST2i(8|16|32|64)")>;
// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z],
        (instregex "^ST2Twov(16b|8h|4s|2d)")>;
// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z],
        (instregex "^ST2Twov(8b|4h|2s)")>;
// -- Store 3-element structure from one lane of 3 registers
def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z],
        (instregex "^ST3i(8|16|32|64)")>;
// -- Store 3-element structures from 3 registers
def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z],
        (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 4-element structure from one lane of 4 registers
def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
        (instregex "^ST4i(8|16|32|64)")>;
// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
        (instregex "^ST4Fourv(16b|8h|4s)")>;
// -- Store 4-element structures from 4 registers, 2D sizes
def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z],
        (instregex "^ST4Fourv2d")>;
// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z],
        (instregex "^ST4Fourv(8b|4h|2s)")>;
// -- Store pair, Q-form
def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>;
// -- Store pair, S/D-form
def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>;
// -- Store register
def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>;
// -- Store register, sign-extended register offset
def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;

// FP data processing, bfloat16 format
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>;
def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>;

// FP data processing, scalar/vector, half precision
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
def : InstRW<[Ampere1BWrite_3cyc_1X],
        (instregex "^FCMPE?H")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X],
        (instregex "^FCCMPE?H")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY],
        (instregex "^FCSELH")>;
// Convert FP to integer, H-form
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
// Convert integer to FP (SCVTF/UCVTF), H-form
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi]16")>;
// Convert to FP from GPR, H-form
// NOTE(review): the `_ZPmZ_` suffix looks like an SVE predicated opcode name
// rather than a GPR-source conversion -- confirm this pattern is intended.
def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>;
// Convert to FP from GPR, fixed-point, H-form
def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>;
def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>;
def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
// FP square root, H-form
def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>;
// FP square root, vector-form, F16
def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>;

// FP data processing, scalar/vector, single/double precision
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1X],
        (instregex "^FCMPE?(S|D)")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X],
        (instregex "^FCCMPE?(S|D)")>;
def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY],
        (instregex "^FCSEL(S|D)")>;
// Convert FP to integer, S/D-form
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
// Convert integer to FP (SCVTF/UCVTF), S/D-form
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>;
// Convert to FP from GPR, S/D-form
// NOTE(review): the `_ZPmZ_` suffix looks like an SVE predicated opcode name
// rather than a GPR-source conversion -- confirm this pattern is intended.
def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>;
// Convert to FP from GPR, fixed-point, S/D-form
def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>;
def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>;
def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULSrr, FNMULSrr)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULDrr, FNMULDrr)>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>;
def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>;

// FP miscellaneous instructions
def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>;
def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>;
def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>;
def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>;
def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;

// Integer arithmetic and logical instructions
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "ADC(W|X)r", "SBC(W|X)r")>;
def : InstRW<[Ampere1BWrite_Arith],
        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>;
def : InstRW<[Ampere1BWrite_1cyc_1AB],
        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>;
def : InstRW<[Ampere1BWrite_ArithFlagsetting],
        (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>;
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>;
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(ADC|SBC)S[WX]r")>;
def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>;
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(CCMN|CCMP)(X|W)")>;
def : InstRW<[Ampere1BWrite_1cyc_1A],
        (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>;
def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>;
def : InstRW<[Ampere1BWrite_3cyc_1BS],
        (instregex "(S|U)MULHr")>;
def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB],
        (instregex "(S|U)?M(ADD|SUB)L?r")>;

// Integer load instructions
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDR(B|D|H|Q|S)ui")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDR(D|Q|W|X)l")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDTR(B|H|W|X)i")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDUR(BB|HH|X|W)i")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDURS(BW|BX|HW|HX|W)i")>;
def : InstRW<[Ampere1BWrite_3cyc_1L],
        (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
// -- Prefetch: literal, unsigned-offset and unscaled-offset forms
def : InstRW<[Ampere1BWrite_1cyc_1L],
        (instrs PRFMl, PRFMui, PRFUMi)>;
// -- Prefetch: register-offset forms
def : InstRW<[Ampere1BWrite_1cyc_1L],
        (instrs PRFMroW, PRFMroX)>;

// Integer miscellaneous instructions
def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>;
def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>;
def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>;
def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>;
def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>;
def : InstRW<[Ampere1BWrite_1cyc_1AB],
        (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
def : InstRW<[Ampere1BWrite_1cyc_1B],
        (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
def : InstRW<[Ampere1BWrite_1cyc_1B],
        (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;

// Integer store instructions
def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>;
def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>;
def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>;
def :
InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; 983def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; 984def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", 985 "STR(X|W)ui", 986 "STUR(BB|HH|X|W)i")>; 987def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; 988def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; 989 990// Memory tagging 991 992// Insert Random Tags 993def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; 994// Load allocation tag 995def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; 996// Store allocation tags 997def : InstRW<[Ampere1BWrite_1cyc_1S], 998 (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; 999// Store allocation tags and pair of registers 1000def : InstRW<[Ampere1BWrite_1cyc_2S], 1001 (instrs STGPi, STGPpre, STGPpost)>; 1002// Store allocation tags and zero data 1003def : InstRW<[Ampere1BWrite_1cyc_1S], 1004 (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; 1005// Store two tags 1006def : InstRW<[Ampere1BWrite_1cyc_2S], 1007 (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; 1008// Store two tags and zero data 1009def : InstRW<[Ampere1BWrite_1cyc_2S], 1010 (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; 1011// Subtract Pointer 1012def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; 1013// Subtract Pointer, flagset 1014def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; 1015// Insert Tag Mask 1016def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; 1017// Arithmetic, immediate to logical address tag 1018def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs ADDG, SUBG)>; 1019 1020// Pointer authentication 1021def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; 1022def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], 1023 (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; 1024def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], 1025 (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; 1026def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex 
"^PAC")>; 1027def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; 1028def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; 1029 1030// Vector integer instructions 1031// -- absolute difference 1032def : InstRW<[Ampere1BWrite_2cyc_1XY], 1033 (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", 1034 "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; 1035// -- arithmetic 1036def : InstRW<[Ampere1BWrite_2cyc_1XY], 1037 (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", 1038 "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", 1039 "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; 1040// -- arithmetic, horizontal, 16B 1041def : InstRW<[Ampere1BWrite_8cyc_4XY], 1042 (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; 1043def : InstRW<[Ampere1BWrite_8cyc_4XY], 1044 (instregex "^[SU](MIN|MAX)Vv16i8v")>; 1045// -- arithmetic, horizontal, 4H/4S 1046def : InstRW<[Ampere1BWrite_4cyc_2XY], 1047 (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; 1048def : InstRW<[Ampere1BWrite_4cyc_2XY], 1049 (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; 1050// -- arithmetic, horizontal, 8B/8H 1051def : InstRW<[Ampere1BWrite_6cyc_3XY], 1052 (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; 1053def : InstRW<[Ampere1BWrite_6cyc_3XY], 1054 (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; 1055// -- arithmetic, narrowing 1056def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; 1057def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; 1058// -- arithmetic, pairwise 1059def : InstRW<[Ampere1BWrite_2cyc_1XY], 1060 (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; 1061// -- arithmetic, saturating 1062def : InstRW<[Ampere1BWrite_2cyc_1XY], 1063 (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; 1064// -- bit count 1065def : InstRW<[Ampere1BWrite_2cyc_1XY], 1066 (instregex "^(CLS|CLZ|CNT)v")>; 1067// -- compare 1068def : InstRW<[Ampere1BWrite_2cyc_1XY], 1069 (instregex "^CMEQv", "^CMGEv", 
"^CMGTv", "^CMLEv", "^CMLTv", 1070 "^CMHIv", "^CMHSv")>; 1071// -- compare non-zero 1072def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; 1073// -- dot product 1074def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; 1075// -- fp reciprocal estimate 1076def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; 1077// -- integer reciprocal estimate 1078def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; 1079// -- logical 1080def : InstRW<[Ampere1BWrite_2cyc_1XY], 1081 (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; 1082// -- logical, narrowing 1083def : InstRW<[Ampere1BWrite_6cyc_2XY], 1084 (instregex "RSHRNv", 1085 "SHRNv", "SQSHRNv", "SQSHRUNv", 1086 "UQXTNv")>; 1087// -- matrix multiply 1088def : InstRW<[Ampere1BWrite_3cyc_1XY], 1089 (instrs SMMLA, UMMLA, USMMLA)>; 1090// -- max/min 1091def : InstRW<[Ampere1BWrite_2cyc_1XY], 1092 (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; 1093def : InstRW<[Ampere1BWrite_2cyc_1XY], 1094 (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; 1095// -- move immediate 1096def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; 1097// -- multiply 1098def : InstRW<[Ampere1BWrite_3cyc_1XY], 1099 (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; 1100// -- multiply accumulate 1101def : InstRW<[Ampere1BWrite_3cyc_1XY], 1102 (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; 1103// -- negation, saturating 1104def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; 1105// -- reverse bits/bytes 1106def : InstRW<[Ampere1BWrite_2cyc_1XY], 1107 (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; 1108// -- shift 1109def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; 1110// -- shift and accumulate 1111def : InstRW<[Ampere1BWrite_2cyc_1XY], 1112 (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; 1113// -- shift, saturating 1114def : 
InstRW<[Ampere1BWrite_2cyc_1XY], 1115 (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", 1116 "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", 1117 "^UQSHL")>; 1118 1119// Vector miscellaneous instructions 1120// -- duplicate element 1121def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; 1122// -- duplicate from GPR 1123def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; 1124// -- extract narrow 1125def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; 1126// -- insert/extract element 1127def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; 1128// -- move FP immediate 1129def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; 1130// -- move element to GPR 1131def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; 1132// -- move from GPR to any element 1133def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; 1134// -- table lookup 1135def : InstRW<[Ampere1BWrite_2cyc_1XY], 1136 (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; 1137def : InstRW<[Ampere1BWrite_4cyc_2XY], 1138 (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; 1139def : InstRW<[Ampere1BWrite_6cyc_3XY], 1140 (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; 1141def : InstRW<[Ampere1BWrite_8cyc_4XY], 1142 (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; 1143// -- transpose 1144def : InstRW<[Ampere1BWrite_2cyc_1XY], 1145 (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; 1146// -- zip/unzip 1147def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; 1148 1149} // SchedModel = Ampere1BModel 1150