1//=- AArch64SchedNeoverseV2.td - NeoverseV2 Scheduling Defs --*- tablegen -*-=// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file defines the scheduling model for the Arm Neoverse V2 processors. 10// All information is taken from the V2 Software Optimisation guide: 11// 12// https://developer.arm.com/documentation/PJDOC-466751330-593177/r0p2 13// 14//===----------------------------------------------------------------------===// 15 16def NeoverseV2Model : SchedMachineModel { 17 let IssueWidth = 16; // Micro-ops dispatched at a time. 18 let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer. 19 let LoadLatency = 4; // Optimistic load latency. 20 let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2. 21 let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57. 22 let CompleteModel = 1; 23 24 list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, 25 [HasSVE2p1, HasCPA, 26 HasCSSC]); 27} 28 29//===----------------------------------------------------------------------===// 30// Define each kind of processor resource and number available on Neoverse V2. 31// Instructions are first fetched and then decoded into internal macro-ops 32// (MOPs). From there, the MOPs proceed through register renaming and dispatch 33// stages. A MOP can be split into two micro-ops further down the pipeline 34// after the decode stage. Once dispatched, micro-ops wait for their operands 35// and issue out-of-order to one of seventeen issue pipelines. Each issue 36// pipeline can accept one micro-op per cycle. 37 38let SchedModel = NeoverseV2Model in { 39 40// Define the (17) issue ports. 41def V2UnitB : ProcResource<2>; // Branch 0/1 42def V2UnitS0 : ProcResource<1>; // Integer single-cycle 0 43def V2UnitS1 : ProcResource<1>; // Integer single-cycle 1 44def V2UnitS2 : ProcResource<1>; // Integer single-cycle 2 45def V2UnitS3 : ProcResource<1>; // Integer single-cycle 3 46def V2UnitM0 : ProcResource<1>; // Integer single/multicycle 0 47def V2UnitM1 : ProcResource<1>; // Integer single/multicycle 1 48def V2UnitV0 : ProcResource<1>; // FP/ASIMD 0 49def V2UnitV1 : ProcResource<1>; // FP/ASIMD 1 50def V2UnitV2 : ProcResource<1>; // FP/ASIMD 2 51def V2UnitV3 : ProcResource<1>; // FP/ASIMD 3 52def V2UnitL01 : ProcResource<2>; // Load/Store 0/1 53def V2UnitL2 : ProcResource<1>; // Load 2 54def V2UnitD : ProcResource<2>; // Store data 0/1 55 56def V2UnitR : ProcResGroup<[V2UnitS0, V2UnitS1]>; // Integer single-cycle 0/1 57def V2UnitS : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitS2, V2UnitS3]>; // Integer single-cycle 0/1/2/3 58def V2UnitF : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitM0, V2UnitM1]>; // Integer single-cycle 0/1 and single/multicycle 0/1 59def V2UnitI : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitS2, V2UnitS3, V2UnitM0, V2UnitM1]>; // Integer single-cycle 0/1/2/3 and single/multicycle 0/1 60def V2UnitM : ProcResGroup<[V2UnitM0, V2UnitM1]>; // Integer single/multicycle 0/1 61def V2UnitL : ProcResGroup<[V2UnitL01, V2UnitL2]>; // Load/Store 0/1 and Load 2 62def V2UnitV : ProcResGroup<[V2UnitV0, V2UnitV1, V2UnitV2, V2UnitV3]>; // FP/ASIMD 0/1/2/3 63def V2UnitV01 : ProcResGroup<[V2UnitV0, V2UnitV1]>; // FP/ASIMD 0/1 64def V2UnitV02 : ProcResGroup<[V2UnitV0, V2UnitV2]>; // FP/ASIMD 0/2 65def V2UnitV13 : ProcResGroup<[V2UnitV1, V2UnitV3]>; // FP/ASIMD 1/3 66def V2UnitV23 : ProcResGroup<[V2UnitV2, V2UnitV3]>; // FP/ASIMD 2/3 67 68// Define commonly used read types. 69 70// No forwarding is provided for these types. 71def : ReadAdvance<ReadI, 0>; 72def : ReadAdvance<ReadISReg, 0>; 73def : ReadAdvance<ReadIEReg, 0>; 74def : ReadAdvance<ReadIM, 0>; 75def : ReadAdvance<ReadIMA, 0>; 76def : ReadAdvance<ReadID, 0>; 77def : ReadAdvance<ReadExtrHi, 0>; 78def : ReadAdvance<ReadAdrBase, 0>; 79def : ReadAdvance<ReadST, 0>; 80def : ReadAdvance<ReadVLD, 0>; 81 82// NOTE: Copied from N2. 83def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } 84def : WriteRes<WriteBarrier, []> { let Latency = 1; } 85def : WriteRes<WriteHint, []> { let Latency = 1; } 86def : WriteRes<WriteLDHi, []> { let Latency = 4; } 87 88//===----------------------------------------------------------------------===// 89// Define customized scheduler read/write types specific to the Neoverse V2. 90 91//===----------------------------------------------------------------------===// 92 93// Define generic 0 micro-op types 94def V2Write_0cyc : SchedWriteRes<[]> { let Latency = 0; } 95 96// Define generic 1 micro-op types 97 98def V2Write_1cyc_1B : SchedWriteRes<[V2UnitB]> { let Latency = 1; } 99def V2Write_1cyc_1F : SchedWriteRes<[V2UnitF]> { let Latency = 1; } 100def V2Write_1cyc_1I : SchedWriteRes<[V2UnitI]> { let Latency = 1; } 101def V2Write_1cyc_1M : SchedWriteRes<[V2UnitM]> { let Latency = 1; } 102def V2Write_1cyc_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 1; } 103def V2Write_1cyc_1L01 : SchedWriteRes<[V2UnitL01]> { let Latency = 1; } 104def V2Write_2cyc_1M : SchedWriteRes<[V2UnitM]> { let Latency = 2; } 105def V2Write_3cyc_1M : SchedWriteRes<[V2UnitM]> { let Latency = 3; } 106def V2Write_2cyc_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 2; } 107def V2Write_3cyc_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 3; } 108def V2Write_5cyc_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 5; } 109def V2Write_12cyc_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 12; 110 let ReleaseAtCycles = [12]; } 111def V2Write_20cyc_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 20; 112 let ReleaseAtCycles = [20]; } 113def V2Write_4cyc_1L : SchedWriteRes<[V2UnitL]> { let Latency = 4; } 114def V2Write_6cyc_1L : SchedWriteRes<[V2UnitL]> { let Latency = 6; } 115def V2Write_2cyc_1V : SchedWriteRes<[V2UnitV]> { let Latency = 2; } 116def V2Write_2cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 2; } 117def V2Write_2cyc_1V01 : SchedWriteRes<[V2UnitV01]> { let Latency = 2; } 118def V2Write_2cyc_1V23 : SchedWriteRes<[V2UnitV23]> { let Latency = 2; } 119def V2Write_3cyc_1V : SchedWriteRes<[V2UnitV]> { let Latency = 3; } 120def V2Write_3cyc_1V01 : SchedWriteRes<[V2UnitV01]> { let Latency = 3; 121 let ReleaseAtCycles = [2]; } 122def V2Write_3cyc_1V23 : SchedWriteRes<[V2UnitV23]> { let Latency = 3; } 123def V2Write_4cyc_1V : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 124def V2Write_5cyc_1V : SchedWriteRes<[V2UnitV]> { let Latency = 5; } 125def V2Write_6cyc_1V : SchedWriteRes<[V2UnitV]> { let Latency = 6; } 126def V2Write_12cyc_1V : SchedWriteRes<[V2UnitV]> { let Latency = 12; } 127def V2Write_3cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 3; } 128def V2Write_3cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 3; } 129def V2Write_4cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 4; } 130def V2Write_4cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 131def V2Write_7cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 7; 132 let ReleaseAtCycles = [7]; } 133def V2Write_7cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 7; 134 let ReleaseAtCycles = [2]; } 135def V2Write_9cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 9; } 136def V2Write_9cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 9; 137 let ReleaseAtCycles = [2]; } 138def V2Write_10cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 10; } 139def V2Write_10cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 10; 140 let ReleaseAtCycles = [2]; } 141def V2Write_12cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 12; 142 let ReleaseAtCycles = [11]; } 143def V2Write_13cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 13; } 144def V2Write_15cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 15; } 145def V2Write_15cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 15; 146 let ReleaseAtCycles = [8]; } 147def V2Write_16cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 16; } 148def V2Write_16cyc_1V02 : SchedWriteRes<[V2UnitV02]> { let Latency = 16; 149 let ReleaseAtCycles = [8]; } 150def V2Write_20cyc_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 20; 151 let ReleaseAtCycles = [20]; } 152def V2Write_2cyc_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 2; } 153def V2Write_2cyc_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 2; } 154def V2Write_3cyc_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 3; } 155def V2Write_4cyc_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 4; } 156def V2Write_4cyc_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 157def V2Write_6cyc_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 6; } 158def V2Write_10cyc_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 10; } 159def V2Write_6cyc_1L01 : SchedWriteRes<[V2UnitL01]> { let Latency = 6; } 160 161//===----------------------------------------------------------------------===// 162// Define generic 2 micro-op types 163 164def V2Write_1cyc_1B_1R : SchedWriteRes<[V2UnitB, V2UnitR]> { 165 let Latency = 1; 166 let NumMicroOps = 2; 167} 168 169def V2Write_6cyc_1M0_1B : SchedWriteRes<[V2UnitM0, V2UnitB]> { 170 let Latency = 6; 171 let NumMicroOps = 2; 172} 173 174def V2Write_9cyc_1M0_1L : SchedWriteRes<[V2UnitM0, V2UnitL]> { 175 let Latency = 9; 176 let NumMicroOps = 2; 177} 178 179def V2Write_3cyc_1I_1M : SchedWriteRes<[V2UnitI, V2UnitM]> { 180 let Latency = 3; 181 let NumMicroOps = 2; 182} 183 184def V2Write_1cyc_2M : SchedWriteRes<[V2UnitM, V2UnitM]> { 185 let Latency = 1; 186 let NumMicroOps = 2; 187} 188 189def V2Write_3cyc_2M : SchedWriteRes<[V2UnitM, V2UnitM]> { 190 let Latency = 3; 191 let NumMicroOps = 2; 192} 193 194def V2Write_4cyc_2M : SchedWriteRes<[V2UnitM, V2UnitM]> { 195 let Latency = 4; 196 let NumMicroOps = 2; 197} 198 199def V2Write_5cyc_1L_1F : SchedWriteRes<[V2UnitL, V2UnitF]> { 200 let Latency = 5; 201 let NumMicroOps = 2; 202} 203 204def V2Write_6cyc_1I_1L : SchedWriteRes<[V2UnitI, V2UnitL]> { 205 let Latency = 6; 206 let NumMicroOps = 2; 207} 208 209def V2Write_7cyc_1F_1L : SchedWriteRes<[V2UnitF, V2UnitL]> { 210 let Latency = 7; 211 let NumMicroOps = 2; 212} 213 214def V2Write_7cyc_1I_1L : SchedWriteRes<[V2UnitI, V2UnitL]> { 215 let Latency = 7; 216 let NumMicroOps = 2; 217} 218 219def V2Write_1cyc_1L01_1D : SchedWriteRes<[V2UnitL01, V2UnitD]> { 220 let Latency = 1; 221 let NumMicroOps = 2; 222} 223 224def V2Write_5cyc_1M0_1V : SchedWriteRes<[V2UnitM0, V2UnitV]> { 225 let Latency = 5; 226 let NumMicroOps = 2; 227} 228 229def V2Write_2cyc_1L01_1V01 : SchedWriteRes<[V2UnitL01, V2UnitV01]> { 230 let Latency = 2; 231 let NumMicroOps = 2; 232} 233 234def V2Write_2cyc_1L01_1V : SchedWriteRes<[V2UnitL01, V2UnitV]> { 235 let Latency = 2; 236 let NumMicroOps = 2; 237} 238 239def V2Write_2cyc_2V01 : SchedWriteRes<[V2UnitV01, V2UnitV01]> { 240 let Latency = 2; 241 let NumMicroOps = 2; 242} 243 244def V2Write_4cyc_2V01 : SchedWriteRes<[V2UnitV01, V2UnitV01]> { 245 let Latency = 4; 246 let NumMicroOps = 2; 247} 248 249def V2Write_4cyc_1L01_1V01 : SchedWriteRes<[V2UnitL01, V2UnitV01]> { 250 let Latency = 4; 251 let NumMicroOps = 2; 252} 253 254def V2Write_4cyc_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> { 255 let Latency = 4; 256 let NumMicroOps = 2; 257} 258 259def V2Write_4cyc_2V0 : SchedWriteRes<[V2UnitV0, V2UnitV0]> { 260 let Latency = 4; 261 let NumMicroOps = 2; 262} 263 264def V2Write_4cyc_2V02 : SchedWriteRes<[V2UnitV02, V2UnitV02]> { 265 let Latency = 4; 266 let NumMicroOps = 2; 267} 268 269def V2Write_4cyc_2V : SchedWriteRes<[V2UnitV, V2UnitV]> { 270 let Latency = 4; 271 let NumMicroOps = 2; 272} 273 274def V2Write_6cyc_2V : SchedWriteRes<[V2UnitV, V2UnitV]> { 275 let Latency = 6; 276 let NumMicroOps = 2; 277} 278 279def V2Write_6cyc_2L : SchedWriteRes<[V2UnitL, V2UnitL]> { 280 let Latency = 6; 281 let NumMicroOps = 2; 282} 283 284def V2Write_8cyc_1L_1V : SchedWriteRes<[V2UnitL, V2UnitV]> { 285 let Latency = 8; 286 let NumMicroOps = 2; 287} 288 289def V2Write_4cyc_1L01_1V : SchedWriteRes<[V2UnitL01, V2UnitV]> { 290 let Latency = 4; 291 let NumMicroOps = 2; 292} 293 294def V2Write_3cyc_1M0_1M : SchedWriteRes<[V2UnitM0, V2UnitM]> { 295 let Latency = 3; 296 let NumMicroOps = 2; 297} 298 299def V2Write_4cyc_1M0_1M : SchedWriteRes<[V2UnitM0, V2UnitM]> { 300 let Latency = 4; 301 let NumMicroOps = 2; 302} 303 304def V2Write_1cyc_1M0_1M : SchedWriteRes<[V2UnitM0, V2UnitM]> { 305 let Latency = 1; 306 let NumMicroOps = 2; 307} 308 309def V2Write_2cyc_1M0_1M : SchedWriteRes<[V2UnitM0, V2UnitM]> { 310 let Latency = 2; 311 let NumMicroOps = 2; 312} 313 314def V2Write_6cyc_2V1 : SchedWriteRes<[V2UnitV1, V2UnitV1]> { 315 let Latency = 6; 316 let NumMicroOps = 2; 317} 318 319def V2Write_4cyc_1V0_1M0 : SchedWriteRes<[V2UnitV0, V2UnitM0]> { 320 let Latency = 4; 321 let NumMicroOps = 2; 322} 323 324def V2Write_5cyc_1V0_1M0 : SchedWriteRes<[V2UnitV0, V2UnitM0]> { 325 let Latency = 5; 326 let NumMicroOps = 2; 327} 328 329def V2Write_5cyc_2V0 : SchedWriteRes<[V2UnitV0, V2UnitV0]> { 330 let Latency = 5; 331 let NumMicroOps = 2; 332} 333 334def V2Write_5cyc_2V02 : SchedWriteRes<[V2UnitV02, V2UnitV02]> { 335 let Latency = 5; 336 let NumMicroOps = 2; 337} 338 339def V2Write_6cyc_1V1_1M0 : SchedWriteRes<[V2UnitV1, V2UnitM0]> { 340 let Latency = 6; 341 let NumMicroOps = 2; 342} 343 344def V2Write_7cyc_1M0_1V02 : SchedWriteRes<[V2UnitM0, V2UnitV02]> { 345 let Latency = 7; 346 let NumMicroOps = 2; 347} 348 349def V2Write_2cyc_1V0_1M : SchedWriteRes<[V2UnitV0, V2UnitM]> { 350 let Latency = 2; 351 let NumMicroOps = 2; 352} 353 354def V2Write_3cyc_1V0_1M : SchedWriteRes<[V2UnitV0, V2UnitM]> { 355 let Latency = 3; 356 let NumMicroOps = 2; 357} 358 359def V2Write_6cyc_1V_1V13 : SchedWriteRes<[V2UnitV, V2UnitV13]> { 360 let Latency = 6; 361 let NumMicroOps = 2; 362} 363 364def V2Write_6cyc_1L_1M : SchedWriteRes<[V2UnitL, V2UnitM]> { 365 let Latency = 6; 366 let NumMicroOps = 2; 367} 368 369def V2Write_6cyc_1L_1S : SchedWriteRes<[V2UnitL, V2UnitS]> { 370 let Latency = 6; 371 let NumMicroOps = 2; 372} 373 374def V2Write_4cyc_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> { 375 let Latency = 4; 376 let NumMicroOps = 2; 377} 378 379def V2Write_8cyc_1M0_1V01 : SchedWriteRes<[V2UnitM0, V2UnitV01]> { 380 let Latency = 8; 381 let NumMicroOps = 2; 382} 383 384//===----------------------------------------------------------------------===// 385// Define generic 3 micro-op types 386 387def V2Write_1cyc_1L01_1D_1I : SchedWriteRes<[V2UnitL01, V2UnitD, V2UnitI]> { 388 let Latency = 1; 389 let NumMicroOps = 3; 390} 391 392def V2Write_2cyc_1L01_1V01_1I : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitI]> { 393 let Latency = 2; 394 let NumMicroOps = 3; 395} 396 397def V2Write_2cyc_1L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01]> { 398 let Latency = 2; 399 let NumMicroOps = 3; 400} 401 402def V2Write_4cyc_1L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01]> { 403 let Latency = 4; 404 let NumMicroOps = 3; 405} 406 407def V2Write_9cyc_1L_2V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV]> { 408 let Latency = 9; 409 let NumMicroOps = 3; 410} 411 412def V2Write_4cyc_3V01 : SchedWriteRes<[V2UnitV01, V2UnitV01, V2UnitV01]> { 413 let Latency = 4; 414 let NumMicroOps = 3; 415} 416 417def V2Write_7cyc_1M_1M0_1V : SchedWriteRes<[V2UnitM, V2UnitM0, V2UnitV]> { 418 let Latency = 7; 419 let NumMicroOps = 3; 420} 421 422def V2Write_2cyc_1L01_1S_1V : SchedWriteRes<[V2UnitL01, V2UnitS, V2UnitV]> { 423 let Latency = 2; 424 let NumMicroOps = 3; 425} 426 427def V2Write_2cyc_1L01_1S_1V01 : SchedWriteRes<[V2UnitL01, V2UnitS, V2UnitV01]> { 428 let Latency = 2; 429 let NumMicroOps = 3; 430} 431 432def V2Write_6cyc_3L : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL]> { 433 let Latency = 6; 434 let NumMicroOps = 3; 435} 436 437def V2Write_6cyc_3V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV]> { 438 let Latency = 6; 439 let NumMicroOps = 3; 440} 441 442def V2Write_8cyc_1L_2V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV]> { 443 let Latency = 8; 444 let NumMicroOps = 3; 445} 446 447//===----------------------------------------------------------------------===// 448// Define generic 4 micro-op types 449 450def V2Write_2cyc_1L01_2V01_1I : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01, 451 V2UnitI]> { 452 let Latency = 2; 453 let NumMicroOps = 4; 454} 455 456def V2Write_2cyc_2L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, 457 V2UnitV01, V2UnitV01]> { 458 let Latency = 2; 459 let NumMicroOps = 4; 460} 461 462def V2Write_4cyc_2L01_2V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, 463 V2UnitV01, V2UnitV01]> { 464 let Latency = 4; 465 let NumMicroOps = 4; 466} 467 468def V2Write_5cyc_1I_3L : SchedWriteRes<[V2UnitI, V2UnitL, V2UnitL, V2UnitL]> { 469 let Latency = 5; 470 let NumMicroOps = 4; 471} 472 473def V2Write_9cyc_2L_2V1 : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV1, 474 V2UnitV1]> { 475 let Latency = 9; 476 let NumMicroOps = 4; 477} 478 479def V2Write_6cyc_4V0 : SchedWriteRes<[V2UnitV0, V2UnitV0, V2UnitV0, V2UnitV0]> { 480 let Latency = 6; 481 let NumMicroOps = 4; 482} 483 484def V2Write_8cyc_4V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, V2UnitV]> { 485 let Latency = 8; 486 let NumMicroOps = 4; 487} 488 489def V2Write_6cyc_2V_2V13 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV13, 490 V2UnitV13]> { 491 let Latency = 6; 492 let NumMicroOps = 4; 493} 494 495def V2Write_8cyc_2V_2V13 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV13, 496 V2UnitV13]> { 497 let Latency = 8; 498 let NumMicroOps = 4; 499} 500 501def V2Write_6cyc_4V02 : SchedWriteRes<[V2UnitV02, V2UnitV02, V2UnitV02, 502 V2UnitV02]> { 503 let Latency = 6; 504 let NumMicroOps = 4; 505} 506 507def V2Write_6cyc_4V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, V2UnitV]> { 508 let Latency = 6; 509 let NumMicroOps = 4; 510} 511 512def V2Write_8cyc_2L_2V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, V2UnitV]> { 513 let Latency = 8; 514 let NumMicroOps = 4; 515} 516 517def V2Write_9cyc_2L_2V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, V2UnitV]> { 518 let Latency = 9; 519 let NumMicroOps = 4; 520} 521 522def V2Write_2cyc_2L01_2V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV, 523 V2UnitV]> { 524 let Latency = 2; 525 let NumMicroOps = 4; 526} 527 528def V2Write_4cyc_2L01_2V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV, 529 V2UnitV]> { 530 let Latency = 4; 531 let NumMicroOps = 4; 532} 533 534def V2Write_8cyc_2M0_2V02 : SchedWriteRes<[V2UnitM0, V2UnitM0, V2UnitV02, 535 V2UnitV02]> { 536 let Latency = 8; 537 let NumMicroOps = 4; 538} 539 540def V2Write_8cyc_2V_2V1 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV1, 541 V2UnitV1]> { 542 let Latency = 8; 543 let NumMicroOps = 4; 544} 545 546def V2Write_4cyc_2M0_2M : SchedWriteRes<[V2UnitM0, V2UnitM0, V2UnitM, 547 V2UnitM]> { 548 let Latency = 4; 549 let NumMicroOps = 4; 550} 551 552def V2Write_5cyc_2M0_2M : SchedWriteRes<[V2UnitM0, V2UnitM0, V2UnitM, 553 V2UnitM]> { 554 let Latency = 5; 555 let NumMicroOps = 4; 556} 557 558def V2Write_6cyc_2I_2L : SchedWriteRes<[V2UnitI, V2UnitI, V2UnitL, V2UnitL]> { 559 let Latency = 6; 560 let NumMicroOps = 4; 561} 562 563def V2Write_7cyc_4L : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, V2UnitL]> { 564 let Latency = 7; 565 let NumMicroOps = 4; 566} 567 568def V2Write_6cyc_1L01_3V01 : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01, 569 V2UnitV01]> { 570 let Latency = 6; 571 let NumMicroOps = 4; 572} 573 574//===----------------------------------------------------------------------===// 575// Define generic 5 micro-op types 576 577def V2Write_2cyc_1L01_2V01_2I : SchedWriteRes<[V2UnitL01, V2UnitV01, V2UnitV01, 578 V2UnitI, V2UnitI]> { 579 let Latency = 2; 580 let NumMicroOps = 5; 581} 582 583def V2Write_8cyc_2L_3V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, V2UnitV, 584 V2UnitV]> { 585 let Latency = 8; 586 let NumMicroOps = 5; 587} 588 589def V2Write_9cyc_1L_4V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV, V2UnitV, 590 V2UnitV]> { 591 let Latency = 9; 592 let NumMicroOps = 5; 593} 594 595def V2Write_10cyc_1L_4V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV, V2UnitV, 596 V2UnitV]> { 597 let Latency = 10; 598 let NumMicroOps = 5; 599} 600 601def V2Write_6cyc_5V : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, V2UnitV, 602 V2UnitV]> { 603 let Latency = 6; 604 let NumMicroOps = 5; 605} 606 607//===----------------------------------------------------------------------===// 608// Define generic 6 micro-op types 609 610def V2Write_8cyc_3L_3V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, 611 V2UnitV, V2UnitV, V2UnitV]> { 612 let Latency = 8; 613 let NumMicroOps = 6; 614} 615 616def V2Write_9cyc_3L_3V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, 617 V2UnitV, V2UnitV, V2UnitV]> { 618 let Latency = 9; 619 let NumMicroOps = 6; 620} 621 622def V2Write_9cyc_2L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, 623 V2UnitV, V2UnitV, V2UnitV]> { 624 let Latency = 9; 625 let NumMicroOps = 6; 626} 627 628def V2Write_9cyc_2L_2V_2S : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitV, 629 V2UnitV, V2UnitS, V2UnitS]> { 630 let Latency = 9; 631 let NumMicroOps = 6; 632} 633 634def V2Write_9cyc_2V_4V13 : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV13, 635 V2UnitV13, V2UnitV13, V2UnitV13]> { 636 let Latency = 9; 637 let NumMicroOps = 6; 638} 639 640def V2Write_2cyc_3L01_3V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 641 V2UnitV, V2UnitV, V2UnitV]> { 642 let Latency = 2; 643 let NumMicroOps = 6; 644} 645 646def V2Write_4cyc_2L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV01, 647 V2UnitV01, V2UnitV01, V2UnitV01]> { 648 let Latency = 4; 649 let NumMicroOps = 6; 650} 651 652def V2Write_5cyc_2L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV01, 653 V2UnitV01, V2UnitV01, V2UnitV01]> { 654 let Latency = 5; 655 let NumMicroOps = 6; 656} 657 658def V2Write_2cyc_3L01_3V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 659 V2UnitV01, V2UnitV01, V2UnitV01]> { 660 let Latency = 2; 661 let NumMicroOps = 6; 662} 663 664def V2Write_4cyc_2L01_2S_2V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitS, 665 V2UnitS, V2UnitV01, V2UnitV01]> { 666 let Latency = 4; 667 let NumMicroOps = 6; 668} 669 670//===----------------------------------------------------------------------===// 671// Define generic 7 micro-op types 672 673def V2Write_8cyc_3L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, 674 V2UnitV, V2UnitV, V2UnitV, V2UnitV]> { 675 let Latency = 8; 676 let NumMicroOps = 7; 677} 678 679//===----------------------------------------------------------------------===// 680// Define generic 8 micro-op types 681 682def V2Write_2cyc_4L01_4V : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 683 V2UnitL01, V2UnitV, V2UnitV, V2UnitV, 684 V2UnitV]> { 685 let Latency = 2; 686 let NumMicroOps = 8; 687} 688 689def V2Write_2cyc_4L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 690 V2UnitL01, V2UnitV01, V2UnitV01, 691 V2UnitV01, V2UnitV01]> { 692 let Latency = 2; 693 let NumMicroOps = 8; 694} 695 696def V2Write_4cyc_4L01_4V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 697 V2UnitL01, V2UnitV01, V2UnitV01, 698 V2UnitV01, V2UnitV01]> { 699 let Latency = 4; 700 let NumMicroOps = 8; 701} 702 703def V2Write_6cyc_2L01_6V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitV01, 704 V2UnitV01, V2UnitV01, V2UnitV01, 705 V2UnitV01, V2UnitV01]> { 706 let Latency = 6; 707 let NumMicroOps = 8; 708} 709 710def V2Write_8cyc_4L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, V2UnitL, 711 V2UnitV, V2UnitV, V2UnitV, V2UnitV]> { 712 let Latency = 8; 713 let NumMicroOps = 8; 714} 715 716//===----------------------------------------------------------------------===// 717// Define generic 9 micro-op types 718 719def V2Write_6cyc_3L01_6V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 720 V2UnitV01, V2UnitV01, V2UnitV01, 721 V2UnitV01, V2UnitV01, V2UnitV01]> { 722 let Latency = 6; 723 let NumMicroOps = 9; 724} 725 726def V2Write_10cyc_1L_8V : SchedWriteRes<[V2UnitL, V2UnitV, V2UnitV, V2UnitV, 727 V2UnitV, V2UnitV, V2UnitV, V2UnitV, 728 V2UnitV]> { 729 let Latency = 10; 730 let NumMicroOps = 9; 731} 732 733def V2Write_10cyc_3V_3L_3S : SchedWriteRes<[V2UnitV, V2UnitV, V2UnitV, 734 V2UnitL, V2UnitL, V2UnitL, 735 V2UnitS, V2UnitS, V2UnitS]> { 736 let Latency = 10; 737 let NumMicroOps = 9; 738} 739 740//===----------------------------------------------------------------------===// 741// Define generic 10 micro-op types 742 743def V2Write_9cyc_6L_4V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, V2UnitL, 744 V2UnitL, V2UnitL, V2UnitV, V2UnitV, 745 V2UnitV, V2UnitV]> { 746 let Latency = 9; 747 let NumMicroOps = 10; 748} 749 750//===----------------------------------------------------------------------===// 751// Define generic 12 micro-op types 752 753def V2Write_5cyc_4L01_8V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 754 V2UnitL01, V2UnitV01, V2UnitV01, 755 V2UnitV01, V2UnitV01, V2UnitV01, 756 V2UnitV01, V2UnitV01, V2UnitV01]> { 757 let Latency = 5; 758 let NumMicroOps = 12; 759} 760 761def V2Write_9cyc_4L_8V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, 762 V2UnitL, V2UnitV, V2UnitV, 763 V2UnitV, V2UnitV, V2UnitV, 764 V2UnitV, V2UnitV, V2UnitV]> { 765 let Latency = 9; 766 let NumMicroOps = 12; 767} 768 769def V2Write_10cyc_4L_8V : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, 770 V2UnitL, V2UnitV, V2UnitV, 771 V2UnitV, V2UnitV, V2UnitV, 772 V2UnitV, V2UnitV, V2UnitV]> { 773 let Latency = 10; 774 let NumMicroOps = 12; 775} 776 777//===----------------------------------------------------------------------===// 778// Define generic 16 micro-op types 779 780def V2Write_7cyc_4L01_12V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 781 V2UnitL01, V2UnitV01, V2UnitV01, 782 V2UnitV01, V2UnitV01, V2UnitV01, 783 V2UnitV01, V2UnitV01, V2UnitV01, 784 V2UnitV01, V2UnitV01, V2UnitV01, 785 V2UnitV01]> { 786 let Latency = 7; 787 let NumMicroOps = 16; 788} 789 790def V2Write_10cyc_4L_8V_4S : SchedWriteRes<[V2UnitL, V2UnitL, V2UnitL, 791 V2UnitL, V2UnitV, V2UnitV, 792 V2UnitV, V2UnitV, V2UnitV, 793 V2UnitV, V2UnitV, V2UnitV, 794 V2UnitS, V2UnitS, V2UnitS, 795 V2UnitS]> { 796 let Latency = 10; 797 let NumMicroOps = 16; 798} 799 800//===----------------------------------------------------------------------===// 801// Define generic 18 micro-op types 802 803def V2Write_7cyc_9L01_9V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 804 V2UnitL01, V2UnitL01, V2UnitL01, 805 V2UnitL01, V2UnitL01, V2UnitL01, 806 V2UnitV01, V2UnitV01, V2UnitV01, 807 V2UnitV01, V2UnitV01, V2UnitV01, 808 V2UnitV01, V2UnitV01, V2UnitV01]> { 809 let Latency = 7; 810 let NumMicroOps = 18; 811} 812 813//===----------------------------------------------------------------------===// 814// Define generic 27 micro-op types 815 816def V2Write_7cyc_9L01_9S_9V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 817 V2UnitL01, V2UnitL01, V2UnitL01, 818 V2UnitL01, V2UnitL01, V2UnitL01, 819 V2UnitS, V2UnitS, V2UnitS, 820 V2UnitS, V2UnitS, V2UnitS, 821 V2UnitS, V2UnitS, V2UnitS, 822 V2UnitV01, V2UnitV01, V2UnitV01, 823 V2UnitV01, V2UnitV01, V2UnitV01, 824 V2UnitV01, V2UnitV01, 825 V2UnitV01]> { 826 let Latency = 7; 827 let NumMicroOps = 27; 828} 829 830//===----------------------------------------------------------------------===// 831// Define generic 36 micro-op types 832 833def V2Write_11cyc_18L01_18V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, V2UnitL01, 834 V2UnitL01, V2UnitL01, V2UnitL01, 835 V2UnitL01, V2UnitL01, V2UnitL01, 836 V2UnitL01, V2UnitL01, V2UnitL01, 837 V2UnitL01, V2UnitL01, V2UnitL01, 838 V2UnitL01, V2UnitL01, V2UnitL01, 839 V2UnitV01, V2UnitV01, V2UnitV01, 840 V2UnitV01, V2UnitV01, V2UnitV01, 841 V2UnitV01, V2UnitV01, V2UnitV01, 842 V2UnitV01, V2UnitV01, V2UnitV01, 843 V2UnitV01, V2UnitV01, V2UnitV01, 844 V2UnitV01, V2UnitV01, 845 V2UnitV01]> { 846 let Latency = 11; 847 let NumMicroOps = 36; 848} 849 850//===----------------------------------------------------------------------===// 851// Define generic 54 micro-op types 852 853def V2Write_11cyc_18L01_18S_18V01 : SchedWriteRes<[V2UnitL01, V2UnitL01, 854 V2UnitL01, V2UnitL01, 855 V2UnitL01, V2UnitL01, 856 V2UnitL01, V2UnitL01, 857 V2UnitL01, V2UnitL01, 858 V2UnitL01, V2UnitL01, 859 V2UnitL01, V2UnitL01, 860 V2UnitL01, V2UnitL01, 861 V2UnitL01, V2UnitL01, 862 V2UnitS, V2UnitS, V2UnitS, 863 V2UnitS, V2UnitS, V2UnitS, 864 V2UnitS, V2UnitS, V2UnitS, 865 V2UnitS, V2UnitS, V2UnitS, 866 V2UnitS, V2UnitS, V2UnitS, 867 V2UnitS, V2UnitS, V2UnitS, 868 V2UnitV01, V2UnitV01, 869 V2UnitV01, V2UnitV01, 870 V2UnitV01, V2UnitV01, 871 V2UnitV01, V2UnitV01, 872 V2UnitV01, V2UnitV01, 873 V2UnitV01, V2UnitV01, 874 V2UnitV01, V2UnitV01, 875 V2UnitV01, V2UnitV01, 876 V2UnitV01, V2UnitV01]> { 877 let Latency = 11; 878 let NumMicroOps = 54; 879} 880 881//===----------------------------------------------------------------------===// 882// Define predicate-controlled types 883 884def V2Write_ArithI : SchedWriteVariant<[ 885 SchedVar<IsCheapLSL, [V2Write_1cyc_1I]>, 886 SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>; 887 888def V2Write_ArithF : SchedWriteVariant<[ 889 SchedVar<IsCheapLSL, [V2Write_1cyc_1F]>, 890 SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>; 891 892def V2Write_Logical : SchedWriteVariant<[ 893 SchedVar<NeoverseNoLSL, [V2Write_1cyc_1F]>, 894 SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>; 895 896def V2Write_Extr : SchedWriteVariant<[ 897 SchedVar<IsRORImmIdiomPred, [V2Write_1cyc_1I]>, 898 SchedVar<NoSchedPred, [V2Write_3cyc_1I_1M]>]>; 899 900def V2Write_LdrHQ : SchedWriteVariant<[ 901 SchedVar<NeoverseHQForm, [V2Write_7cyc_1I_1L]>, 902 SchedVar<NoSchedPred, [V2Write_6cyc_1L]>]>; 903 904def V2Write_StrHQ : SchedWriteVariant<[ 905 SchedVar<NeoverseHQForm, [V2Write_2cyc_1L01_1V01_1I]>, 906 SchedVar<NoSchedPred, [V2Write_2cyc_1L01_1V01]>]>; 907 908def V2Write_0or1cyc_1I : SchedWriteVariant<[ 909 SchedVar<NeoverseZeroMove, [V2Write_0cyc]>, 910 SchedVar<NoSchedPred, [V2Write_1cyc_1I]>]>; 911 912def V2Write_0or2cyc_1V : SchedWriteVariant<[ 913 SchedVar<NeoverseZeroMove, [V2Write_0cyc]>, 914 SchedVar<NoSchedPred, [V2Write_2cyc_1V]>]>; 915 916def V2Write_0or3cyc_1M0 : SchedWriteVariant<[ 917 SchedVar<NeoverseZeroMove, [V2Write_0cyc]>, 918 SchedVar<NoSchedPred, [V2Write_3cyc_1M0]>]>; 919 920def V2Write_2or3cyc_1M : SchedWriteVariant<[ 921 SchedVar<NeoversePdIsPg, [V2Write_3cyc_1M]>, 922 SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>; 923 924def V2Write_3or4cyc_2M : SchedWriteVariant<[ 925 SchedVar<NeoversePdIsPg, [V2Write_4cyc_2M]>, 926 SchedVar<NoSchedPred, [V2Write_3cyc_2M]>]>; 927 928def V2Write_1or2cyc_1M0 : SchedWriteVariant<[ 929 SchedVar<NeoversePdIsPg, [V2Write_2cyc_1M0]>, 930 SchedVar<NoSchedPred, [V2Write_1cyc_1M0]>]>; 931 932def V2Write_2or3cyc_1M0 : SchedWriteVariant<[ 933 SchedVar<NeoversePdIsPg, [V2Write_3cyc_1M0]>, 934 SchedVar<NoSchedPred, [V2Write_2cyc_1M0]>]>; 935 936def V2Write_1or2cyc_1M0_1M : SchedWriteVariant<[ 937 SchedVar<NeoversePdIsPg, [V2Write_2cyc_1M0_1M]>, 938 SchedVar<NoSchedPred, [V2Write_1cyc_1M0_1M]>]>; 939 940def V2Write_3or4cyc_1M0_1M : SchedWriteVariant<[ 941 SchedVar<NeoversePdIsPg, [V2Write_4cyc_1M0_1M]>, 942 SchedVar<NoSchedPred, [V2Write_3cyc_1M0_1M]>]>; 943 944def V2Write_4or5cyc_2M0_2M : SchedWriteVariant<[ 945 SchedVar<NeoversePdIsPg, [V2Write_5cyc_2M0_2M]>, 946 SchedVar<NoSchedPred, [V2Write_4cyc_2M0_2M]>]>; 947 948def V2Write_4or5cyc_1V0_1M0 : SchedWriteVariant<[ 949 SchedVar<NeoversePdIsPg, [V2Write_5cyc_1V0_1M0]>, 950 SchedVar<NoSchedPred, [V2Write_4cyc_1V0_1M0]>]>; 951 952def V2Write_2or3cyc_1V0_1M : SchedWriteVariant<[ 953 SchedVar<NeoversePdIsPg, [V2Write_3cyc_1V0_1M]>, 954 SchedVar<NoSchedPred, [V2Write_2cyc_1V0_1M]>]>; 955 956def V2Write_IncDec : SchedWriteVariant<[ 957 SchedVar<NeoverseCheapIncDec, [V2Write_1cyc_1F]>, 958 SchedVar<NoSchedPred, [V2Write_2cyc_1M]>]>; 959 960//===----------------------------------------------------------------------===// 961// Define forwarded types 962 963// NOTE: SOG, p. 16, n. 2: Accumulator forwarding is not supported for 964// consumers of 64 bit multiply high operations? 965def V2Wr_IM : SchedWriteRes<[V2UnitM]> { let Latency = 2; } 966def V2Wr_IMA : SchedWriteRes<[V2UnitM0]> { let Latency = 2; } 967def V2Wr_IMUL : SchedWriteVariant<[ 968 SchedVar<IsReg3ZeroPred, [V2Wr_IM]>, 969 SchedVar<NoSchedPred, [V2Wr_IMA]>]>; 970def V2Rd_IMA : SchedReadAdvance<1, [V2Wr_IMA]>; 971 972def V2Wr_FMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 973def V2Rd_FMA : SchedReadAdvance<2, [WriteFMul, V2Wr_FMA]>; 974 975def V2Wr_VA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 976def V2Rd_VA : SchedReadAdvance<3, [V2Wr_VA]>; 977 978def V2Wr_VDOT : SchedWriteRes<[V2UnitV]> { let Latency = 3; } 979def V2Rd_VDOT : SchedReadAdvance<2, [V2Wr_VDOT]>; 980 981def V2Wr_VMMA : SchedWriteRes<[V2UnitV]> { let Latency = 3; } 982def V2Rd_VMMA : SchedReadAdvance<2, [V2Wr_VMMA]>; 983 984def V2Wr_VMA : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 985def V2Rd_VMA : SchedReadAdvance<3, [V2Wr_VMA]>; 986 987def V2Wr_VMAH : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 4; } 988def V2Rd_VMAH : SchedReadAdvance<2, [V2Wr_VMAH]>; 989 990def V2Wr_VMAL : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 991def V2Rd_VMAL : SchedReadAdvance<3, [V2Wr_VMAL]>; 992 993def V2Wr_VPA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 994def V2Rd_VPA : SchedReadAdvance<3, [V2Wr_VPA]>; 995 996def V2Wr_VSA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 997def V2Rd_VSA : SchedReadAdvance<3, [V2Wr_VSA]>; 998 999def V2Wr_VFCMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 1000def V2Rd_VFCMA : SchedReadAdvance<2, [V2Wr_VFCMA]>; 1001 1002def V2Wr_VFM : SchedWriteRes<[V2UnitV]> { let Latency = 3; } 1003def V2Wr_VFMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 1004def V2Rd_VFMA : SchedReadAdvance<2, [V2Wr_VFM, V2Wr_VFMA]>; 1005 1006def V2Wr_VFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 1007def V2Rd_VFMAL : SchedReadAdvance<2, [V2Wr_VFMAL]>; 1008 1009def V2Wr_VBFDOT : SchedWriteRes<[V2UnitV]> { let Latency = 5; } 1010def V2Rd_VBFDOT : SchedReadAdvance<2, [V2Wr_VBFDOT]>; 1011def V2Wr_VBFMMA : SchedWriteRes<[V2UnitV]> { let Latency = 6; } 1012def V2Rd_VBFMMA : SchedReadAdvance<2, [V2Wr_VBFMMA]>; 1013def V2Wr_VBFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 5; } 1014def V2Rd_VBFMAL : SchedReadAdvance<3, [V2Wr_VBFMAL]>; 1015 1016def V2Wr_CRC : SchedWriteRes<[V2UnitM0]> { let Latency = 2; } 1017def V2Rd_CRC : SchedReadAdvance<1, [V2Wr_CRC]>; 1018 1019def V2Wr_ZA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 1020def V2Rd_ZA : SchedReadAdvance<3, [V2Wr_ZA]>; 1021def V2Wr_ZPA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 1022def V2Rd_ZPA : SchedReadAdvance<3, [V2Wr_ZPA]>; 1023def V2Wr_ZSA : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } 1024def V2Rd_ZSA : SchedReadAdvance<3, [V2Wr_ZSA]>; 1025 1026def V2Wr_ZDOTB : SchedWriteRes<[V2UnitV]> { let Latency = 3; } 1027def V2Rd_ZDOTB : SchedReadAdvance<2, [V2Wr_ZDOTB]>; 1028def V2Wr_ZDOTH : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 1029def V2Rd_ZDOTH : SchedReadAdvance<3, [V2Wr_ZDOTH]>; 1030 1031// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce 1032// throughput to 1 in case of forwarding? 1033def V2Wr_ZCMABHS : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 1034def V2Rd_ZCMABHS : SchedReadAdvance<3, [V2Wr_ZCMABHS]>; 1035def V2Wr_ZCMAD : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 5; } 1036def V2Rd_ZCMAD : SchedReadAdvance<2, [V2Wr_ZCMAD]>; 1037 1038def V2Wr_ZMMA : SchedWriteRes<[V2UnitV]> { let Latency = 3; } 1039def V2Rd_ZMMA : SchedReadAdvance<2, [V2Wr_ZMMA]>; 1040 1041def V2Wr_ZMABHS : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 4; } 1042def V2Rd_ZMABHS : SchedReadAdvance<3, [V2Wr_ZMABHS]>; 1043def V2Wr_ZMAD : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 5; } 1044def V2Rd_ZMAD : SchedReadAdvance<2, [V2Wr_ZMAD]>; 1045 1046def V2Wr_ZMAL : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 1047def V2Rd_ZMAL : SchedReadAdvance<3, [V2Wr_ZMAL]>; 1048 1049def V2Wr_ZMASQL : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 1050def V2Wr_ZMASQBHS : SchedWriteRes<[V2UnitV02]> { let Latency = 4; } 1051def V2Wr_ZMASQD : SchedWriteRes<[V2UnitV02, V2UnitV02]> { let Latency = 5; } 1052def V2Rd_ZMASQ : SchedReadAdvance<2, [V2Wr_ZMASQL, V2Wr_ZMASQBHS, 1053 V2Wr_ZMASQD]>; 1054 1055def V2Wr_ZFCMA : SchedWriteRes<[V2UnitV]> { let Latency = 5; } 1056def V2Rd_ZFCMA : SchedReadAdvance<3, [V2Wr_ZFCMA]>; 1057 1058def V2Wr_ZFMA : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 1059def V2Rd_ZFMA : SchedReadAdvance<2, [V2Wr_ZFMA]>; 1060 1061def V2Wr_ZFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 4; } 1062def V2Rd_ZFMAL : SchedReadAdvance<2, [V2Wr_ZFMAL]>; 1063 1064def V2Wr_ZBFDOT : SchedWriteRes<[V2UnitV]> { let Latency = 5; } 1065def V2Rd_ZBFDOT : SchedReadAdvance<2, [V2Wr_ZBFDOT]>; 1066def V2Wr_ZBFMMA : SchedWriteRes<[V2UnitV]> { let Latency = 6; } 1067def V2Rd_ZBFMMA : SchedReadAdvance<2, [V2Wr_ZBFMMA]>; 1068def V2Wr_ZBFMAL : SchedWriteRes<[V2UnitV]> { let Latency = 5; } 1069def V2Rd_ZBFMAL : SchedReadAdvance<3, [V2Wr_ZBFMAL]>; 1070 1071//===----------------------------------------------------------------------===// 1072// Define types with long resource cycles (rc) 1073 1074def V2Write_6cyc_1V1_5rc : SchedWriteRes<[V2UnitV1]> { let Latency = 6; let ReleaseAtCycles = [ 5]; } 1075def V2Write_7cyc_1V02_7rc : SchedWriteRes<[V2UnitV02]> { let Latency = 7; let ReleaseAtCycles = [ 7]; } 1076def V2Write_10cyc_1V02_5rc : SchedWriteRes<[V2UnitV02]> { let Latency = 10; let ReleaseAtCycles = [ 5]; } 1077def V2Write_10cyc_1V02_9rc : SchedWriteRes<[V2UnitV02]> { let Latency = 10; let ReleaseAtCycles = [ 9]; } 1078def V2Write_10cyc_1V02_10rc : SchedWriteRes<[V2UnitV02]> { let Latency = 10; let ReleaseAtCycles = [10]; } 1079def V2Write_10cyc_1V1_9rc : SchedWriteRes<[V2UnitV1]> { let Latency = 10; let ReleaseAtCycles = [ 9]; } 1080def V2Write_13cyc_1V02_12rc : SchedWriteRes<[V2UnitV02]> { let Latency = 13; let ReleaseAtCycles = [12]; } 1081def V2Write_13cyc_1V02_13rc : SchedWriteRes<[V2UnitV02]> { let Latency = 13; let ReleaseAtCycles = [13]; } 1082def V2Write_15cyc_1V02_14rc : SchedWriteRes<[V2UnitV02]> { let Latency = 15; let ReleaseAtCycles = [14]; } 1083def V2Write_16cyc_1V02_14rc : SchedWriteRes<[V2UnitV02]> { let Latency = 16; let ReleaseAtCycles = [14]; } 1084def V2Write_16cyc_1V02_15rc : SchedWriteRes<[V2UnitV02]> { let Latency = 16; let ReleaseAtCycles = [15]; } 1085 1086// Miscellaneous 1087// ----------------------------------------------------------------------------- 1088 1089def : InstRW<[WriteI], (instrs COPY)>; 1090 1091// §3.3 Branch instructions 1092// ----------------------------------------------------------------------------- 1093 1094// Branch, immed 1095// Compare and branch 1096def : SchedAlias<WriteBr, V2Write_1cyc_1B>; 1097 1098// Branch, register 1099def : SchedAlias<WriteBrReg, V2Write_1cyc_1B>; 1100 1101// Branch and link, immed 1102// Branch and link, register 1103def : InstRW<[V2Write_1cyc_1B_1R], (instrs BL, BLR)>; 1104 1105// §3.4 Arithmetic and Logical Instructions 1106// ----------------------------------------------------------------------------- 1107 1108// ALU, basic 1109// ALU, basic, flagset 1110def : SchedAlias<WriteI, V2Write_1cyc_1I>; 1111def : InstRW<[V2Write_1cyc_1F], (instregex "^(ADD|SUB)S[WX]r[ir]$", 1112 "^(ADC|SBC)S[WX]r$", 1113 "^ANDS[WX]ri$")>; 1114def : InstRW<[V2Write_0or1cyc_1I], (instregex "^MOVZ[WX]i$")>; 1115 1116// ALU, extend and shift 1117def : SchedAlias<WriteIEReg, V2Write_2cyc_1M>; 1118 1119// Conditional compare 1120def : InstRW<[V2Write_1cyc_1F], (instregex "^CCM[NP][WX][ir]")>; 1121 1122// Arithmetic, LSL shift, shift <= 4 1123// Arithmetic, flagset, LSL shift, shift <= 4 1124// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4 1125def : SchedAlias<WriteISReg, V2Write_ArithI>; 1126def : InstRW<[V2Write_ArithF], 1127 (instregex "^(ADD|SUB)S[WX]rs$")>; 1128 1129// Arithmetic, immediate to logical address tag 1130def : InstRW<[V2Write_2cyc_1M], (instrs ADDG, SUBG)>; 1131 1132// Convert floating-point condition flags 1133// Flag manipulation instructions 1134def : WriteRes<WriteSys, []> { let Latency = 1; } 1135 1136// Insert Random Tags 1137def : InstRW<[V2Write_2cyc_1M], (instrs IRG, IRGstack)>; 1138 1139// Insert Tag Mask 1140// Subtract Pointer 1141// Subtract Pointer, flagset 1142def : InstRW<[V2Write_1cyc_1I], (instrs GMI, SUBP, SUBPS)>; 1143 1144// Logical, shift, no flagset 1145def : InstRW<[V2Write_1cyc_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>; 1146def : InstRW<[V2Write_0or1cyc_1I], (instregex "^ORR[WX]rs$")>; 1147 1148// Logical, shift, flagset 1149def : InstRW<[V2Write_Logical], (instregex "^(AND|BIC)S[WX]rs$")>; 1150 1151// Move and shift instructions 1152// ----------------------------------------------------------------------------- 1153 1154def : SchedAlias<WriteImm, V2Write_1cyc_1I>; 1155 1156// §3.5 Divide and multiply instructions 1157// ----------------------------------------------------------------------------- 1158 1159// SDIV, UDIV 1160def : SchedAlias<WriteID32, V2Write_12cyc_1M0>; 1161def : SchedAlias<WriteID64, V2Write_20cyc_1M0>; 1162 1163def : SchedAlias<WriteIM32, V2Write_2cyc_1M>; 1164def : SchedAlias<WriteIM64, V2Write_2cyc_1M>; 1165 1166// Multiply 1167// Multiply accumulate, W-form 1168// Multiply accumulate, X-form 1169def : InstRW<[V2Wr_IMUL, ReadIM, ReadIM, V2Rd_IMA], 1170 (instregex "^M(ADD|SUB)[WX]rrr$")>; 1171 1172// Multiply accumulate long 1173// Multiply long 1174def : InstRW<[V2Wr_IMUL, ReadIM, ReadIM, V2Rd_IMA], 1175 (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; 1176 1177// Multiply high 1178def : InstRW<[V2Write_3cyc_1M], (instrs SMULHrr, UMULHrr)>; 1179 1180// Pointer Authentication Instructions (v8.3 PAC) 1181// ----------------------------------------------------------------------------- 1182 1183// Authenticate data address 1184// Authenticate instruction address 1185// Compute pointer authentication code for data address 1186// Compute pointer authentication code, using generic key 1187// Compute pointer authentication code for instruction address 1188def : InstRW<[V2Write_5cyc_1M0], (instregex "^AUT", "^PAC")>; 1189 1190// Branch and link, register, with pointer authentication 1191// Branch, register, with pointer authentication 1192// Branch, return, with pointer authentication 1193def : InstRW<[V2Write_6cyc_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA, 1194 BRAAZ, BRAB, BRABZ, RETAA, RETAB, 1195 ERETAA, ERETAB)>; 1196 1197 1198// Load register, with pointer authentication 1199def : InstRW<[V2Write_9cyc_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>; 1200 1201// Strip pointer authentication code 1202def : InstRW<[V2Write_2cyc_1M0], (instrs XPACD, XPACI, XPACLRI)>; 1203 1204// Miscellaneous data-processing instructions 1205// ----------------------------------------------------------------------------- 1206 1207// Address generation 1208def : InstRW<[V2Write_1cyc_1F], (instrs ADR, ADRP)>; 1209 1210// Bitfield extract, one reg 1211// Bitfield extract, two regs 1212def : SchedAlias<WriteExtr, V2Write_Extr>; 1213def : InstRW<[V2Write_Extr], (instrs EXTRWrri, EXTRXrri)>; 1214 1215// Bitfield move, basic 1216def : SchedAlias<WriteIS, V2Write_1cyc_1I>; 1217 1218// Bitfield move, insert 1219def : InstRW<[V2Write_2cyc_1M], (instregex "^BFM[WX]ri$")>; 1220 1221// Load instructions 1222// ----------------------------------------------------------------------------- 1223 1224// NOTE: SOG p. 19: Throughput of LDN?P X-form should be 2, but reported as 3. 1225 1226def : SchedAlias<WriteLD, V2Write_4cyc_1L>; 1227def : SchedAlias<WriteLDIdx, V2Write_4cyc_1L>; 1228 1229// Load register, literal 1230def : InstRW<[V2Write_5cyc_1L_1F], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>; 1231 1232// Load pair, signed immed offset, signed words 1233def : InstRW<[V2Write_5cyc_1I_3L, WriteLDHi], (instrs LDPSWi)>; 1234 1235// Load pair, immed post-index or immed pre-index, signed words 1236def : InstRW<[WriteAdr, V2Write_5cyc_1I_3L, WriteLDHi], 1237 (instregex "^LDPSW(post|pre)$")>; 1238 1239// Store instructions 1240// ----------------------------------------------------------------------------- 1241 1242// NOTE: SOG, p. 20: Unsure if STRH uses pipeline I. 1243 1244def : SchedAlias<WriteST, V2Write_1cyc_1L01_1D>; 1245def : SchedAlias<WriteSTIdx, V2Write_1cyc_1L01_1D>; 1246def : SchedAlias<WriteSTP, V2Write_1cyc_1L01_1D>; 1247def : SchedAlias<WriteAdr, V2Write_1cyc_1I>; 1248 1249// Tag load instructions 1250// ----------------------------------------------------------------------------- 1251 1252// Load allocation tag 1253// Load multiple allocation tags 1254def : InstRW<[V2Write_4cyc_1L], (instrs LDG, LDGM)>; 1255 1256// Tag store instructions 1257// ----------------------------------------------------------------------------- 1258 1259// Store allocation tags to one or two granules, post-index 1260// Store allocation tags to one or two granules, pre-index 1261// Store allocation tag to one or two granules, zeroing, post-index 1262// Store Allocation Tag to one or two granules, zeroing, pre-index 1263// Store allocation tag and reg pair to memory, post-Index 1264// Store allocation tag and reg pair to memory, pre-Index 1265def : InstRW<[V2Write_1cyc_1L01_1D_1I], (instrs STGPreIndex, STGPostIndex, 1266 ST2GPreIndex, ST2GPostIndex, 1267 STZGPreIndex, STZGPostIndex, 1268 STZ2GPreIndex, STZ2GPostIndex, 1269 STGPpre, STGPpost)>; 1270 1271// Store allocation tags to one or two granules, signed offset 1272// Store allocation tag to two granules, zeroing, signed offset 1273// Store allocation tag and reg pair to memory, signed offset 1274// Store multiple allocation tags 1275def : InstRW<[V2Write_1cyc_1L01_1D], (instrs STGi, ST2Gi, STZGi, 1276 STZ2Gi, STGPi, STGM, STZGM)>; 1277 1278// FP data processing instructions 1279// ----------------------------------------------------------------------------- 1280 1281// FP absolute value 1282// FP arithmetic 1283// FP min/max 1284// FP negate 1285// FP select 1286def : SchedAlias<WriteF, V2Write_2cyc_1V>; 1287 1288// FP compare 1289def : SchedAlias<WriteFCmp, V2Write_2cyc_1V0>; 1290 1291// FP divide, square root 1292def : SchedAlias<WriteFDiv, V2Write_7cyc_1V02>; 1293 1294// FP divide, H-form 1295def : InstRW<[V2Write_7cyc_1V02], (instrs FDIVHrr)>; 1296// FP divide, S-form 1297def : InstRW<[V2Write_10cyc_1V02], (instrs FDIVSrr)>; 1298// FP divide, D-form 1299def : InstRW<[V2Write_15cyc_1V02], (instrs FDIVDrr)>; 1300 1301// FP square root, H-form 1302def : InstRW<[V2Write_7cyc_1V02], (instrs FSQRTHr)>; 1303// FP square root, S-form 1304def : InstRW<[V2Write_9cyc_1V02], (instrs FSQRTSr)>; 1305// FP square root, D-form 1306def : InstRW<[V2Write_16cyc_1V02], (instrs FSQRTDr)>; 1307 1308// FP multiply 1309def : WriteRes<WriteFMul, [V2UnitV]> { let Latency = 3; } 1310 1311// FP multiply accumulate 1312def : InstRW<[V2Wr_FMA, ReadDefault, ReadDefault, V2Rd_FMA], 1313 (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; 1314 1315// FP round to integral 1316def : InstRW<[V2Write_3cyc_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$", 1317 "^FRINT(32|64)[XZ][SD]r$")>; 1318 1319// FP miscellaneous instructions 1320// ----------------------------------------------------------------------------- 1321 1322// FP convert, from gen to vec reg 1323def : InstRW<[V2Write_3cyc_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>; 1324 1325// FP convert, from vec to gen reg 1326def : InstRW<[V2Write_3cyc_1V01], 1327 (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]ri?$")>; 1328 1329// FP convert, Javascript from vec to gen reg 1330def : SchedAlias<WriteFCvt, V2Write_3cyc_1V0>; 1331 1332// FP convert, from vec to vec reg 1333def : InstRW<[V2Write_3cyc_1V02], (instrs FCVTSHr, FCVTDHr, FCVTHSr, FCVTDSr, 1334 FCVTHDr, FCVTSDr, FCVTXNv1i64)>; 1335 1336// FP move, immed 1337// FP move, register 1338def : SchedAlias<WriteFImm, V2Write_2cyc_1V>; 1339 1340// FP transfer, from gen to low half of vec reg 1341def : InstRW<[V2Write_0or3cyc_1M0], 1342 (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr)>; 1343 1344// FP transfer, from gen to high half of vec reg 1345def : InstRW<[V2Write_5cyc_1M0_1V], (instrs FMOVXDHighr)>; 1346 1347// FP transfer, from vec to gen reg 1348def : SchedAlias<WriteFCopy, V2Write_2cyc_2V01>; 1349 1350// FP load instructions 1351// ----------------------------------------------------------------------------- 1352 1353// Load vector reg, literal, S/D/Q forms 1354def : InstRW<[V2Write_7cyc_1F_1L], (instregex "^LDR[SDQ]l$")>; 1355 1356// Load vector reg, unscaled immed 1357def : InstRW<[V2Write_6cyc_1L], (instregex "^LDUR[BHSDQ]i$")>; 1358 1359// Load vector reg, immed post-index 1360// Load vector reg, immed pre-index 1361def : InstRW<[WriteAdr, V2Write_6cyc_1I_1L], 1362 (instregex "^LDR[BHSDQ](pre|post)$")>; 1363 1364// Load vector reg, unsigned immed 1365def : InstRW<[V2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>; 1366 1367// Load vector reg, register offset, basic 1368// Load vector reg, register offset, scale, S/D-form 1369// Load vector reg, register offset, scale, H/Q-form 1370// Load vector reg, register offset, extend 1371// Load vector reg, register offset, extend, scale, S/D-form 1372// Load vector reg, register offset, extend, scale, H/Q-form 1373def : InstRW<[V2Write_LdrHQ, ReadAdrBase], (instregex "^LDR[BHSDQ]ro[WX]$")>; 1374 1375// Load vector pair, immed offset, S/D-form 1376def : InstRW<[V2Write_6cyc_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>; 1377 1378// Load vector pair, immed offset, Q-form 1379def : InstRW<[V2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>; 1380 1381// Load vector pair, immed post-index, S/D-form 1382// Load vector pair, immed pre-index, S/D-form 1383def : InstRW<[WriteAdr, V2Write_6cyc_1I_1L, WriteLDHi], 1384 (instregex "^LDP[SD](pre|post)$")>; 1385 1386// Load vector pair, immed post-index, Q-form 1387// Load vector pair, immed pre-index, Q-form 1388def : InstRW<[WriteAdr, V2Write_6cyc_2I_2L, WriteLDHi], (instrs LDPQpost, 1389 LDPQpre)>; 1390 1391// FP store instructions 1392// ----------------------------------------------------------------------------- 1393 1394// Store vector reg, unscaled immed, B/H/S/D-form 1395// Store vector reg, unscaled immed, Q-form 1396def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^STUR[BHSDQ]i$")>; 1397 1398// Store vector reg, immed post-index, B/H/S/D-form 1399// Store vector reg, immed post-index, Q-form 1400// Store vector reg, immed pre-index, B/H/S/D-form 1401// Store vector reg, immed pre-index, Q-form 1402def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01_1I], 1403 (instregex "^STR[BHSDQ](pre|post)$")>; 1404 1405// Store vector reg, unsigned immed, B/H/S/D-form 1406// Store vector reg, unsigned immed, Q-form 1407def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^STR[BHSDQ]ui$")>; 1408 1409// Store vector reg, register offset, basic, B/H/S/D-form 1410// Store vector reg, register offset, basic, Q-form 1411// Store vector reg, register offset, scale, H-form 1412// Store vector reg, register offset, scale, S/D-form 1413// Store vector reg, register offset, scale, Q-form 1414// Store vector reg, register offset, extend, B/H/S/D-form 1415// Store vector reg, register offset, extend, Q-form 1416// Store vector reg, register offset, extend, scale, H-form 1417// Store vector reg, register offset, extend, scale, S/D-form 1418// Store vector reg, register offset, extend, scale, Q-form 1419def : InstRW<[V2Write_StrHQ, ReadAdrBase], 1420 (instregex "^STR[BHSDQ]ro[WX]$")>; 1421 1422// Store vector pair, immed offset, S-form 1423// Store vector pair, immed offset, D-form 1424def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^STN?P[SD]i$")>; 1425 1426// Store vector pair, immed offset, Q-form 1427def : InstRW<[V2Write_2cyc_1L01_2V01], (instrs STPQi, STNPQi)>; 1428 1429// Store vector pair, immed post-index, S-form 1430// Store vector pair, immed post-index, D-form 1431// Store vector pair, immed pre-index, S-form 1432// Store vector pair, immed pre-index, D-form 1433def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01_1I], 1434 (instregex "^STP[SD](pre|post)$")>; 1435 1436// Store vector pair, immed post-index, Q-form 1437def : InstRW<[V2Write_2cyc_1L01_2V01_1I], (instrs STPQpost)>; 1438 1439// Store vector pair, immed pre-index, Q-form 1440def : InstRW<[V2Write_2cyc_1L01_2V01_2I], (instrs STPQpre)>; 1441 1442// ASIMD integer instructions 1443// ----------------------------------------------------------------------------- 1444 1445// ASIMD absolute diff 1446// ASIMD absolute diff long 1447// ASIMD arith, basic 1448// ASIMD arith, complex 1449// ASIMD arith, pair-wise 1450// ASIMD compare 1451// ASIMD logical 1452// ASIMD max/min, basic and pair-wise 1453def : SchedAlias<WriteVd, V2Write_2cyc_1V>; 1454def : SchedAlias<WriteVq, V2Write_2cyc_1V>; 1455 1456// ASIMD absolute diff accum 1457// ASIMD absolute diff accum long 1458def : InstRW<[V2Wr_VA, V2Rd_VA], (instregex "^[SU]ABAL?v")>; 1459 1460// ASIMD arith, reduce, 4H/4S 1461def : InstRW<[V2Write_2cyc_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>; 1462 1463// ASIMD arith, reduce, 8B/8H 1464def : InstRW<[V2Write_4cyc_1V13_1V], 1465 (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>; 1466 1467// ASIMD arith, reduce, 16B 1468def : InstRW<[V2Write_4cyc_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>; 1469 1470// ASIMD dot product 1471// ASIMD dot product using signed and unsigned integers 1472def : InstRW<[V2Wr_VDOT, V2Rd_VDOT], 1473 (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>; 1474 1475// ASIMD matrix multiply-accumulate 1476def : InstRW<[V2Wr_VMMA, V2Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; 1477 1478// ASIMD max/min, reduce, 4H/4S 1479def : InstRW<[V2Write_2cyc_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$", 1480 "^[SU](MAX|MIN)Vv4i32v$")>; 1481 1482// ASIMD max/min, reduce, 8B/8H 1483def : InstRW<[V2Write_4cyc_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$", 1484 "^[SU](MAX|MIN)Vv8i16v$")>; 1485 1486// ASIMD max/min, reduce, 16B 1487def : InstRW<[V2Write_4cyc_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>; 1488 1489// ASIMD multiply 1490def : InstRW<[V2Write_4cyc_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>; 1491 1492// ASIMD multiply accumulate 1493def : InstRW<[V2Wr_VMA, V2Rd_VMA], (instregex "^MLAv", "^MLSv")>; 1494 1495// ASIMD multiply accumulate high 1496def : InstRW<[V2Wr_VMAH, V2Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; 1497 1498// ASIMD multiply accumulate long 1499def : InstRW<[V2Wr_VMAL, V2Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; 1500 1501// ASIMD multiply accumulate saturating long 1502def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQDML[AS]L[iv]")>; 1503 1504// ASIMD multiply/multiply long (8x8) polynomial, D-form 1505// ASIMD multiply/multiply long (8x8) polynomial, Q-form 1506def : InstRW<[V2Write_3cyc_1V23], (instregex "^PMULL?(v8i8|v16i8)$")>; 1507 1508// ASIMD multiply long 1509def : InstRW<[V2Write_3cyc_1V02], (instregex "^[SU]MULLv", "^SQDMULL[iv]")>; 1510 1511// ASIMD pairwise add and accumulate long 1512def : InstRW<[V2Wr_VPA, V2Rd_VPA], (instregex "^[SU]ADALPv")>; 1513 1514// ASIMD shift accumulate 1515def : InstRW<[V2Wr_VSA, V2Rd_VSA], (instregex "^[SU]SRA[dv]", "^[SU]RSRA[dv]")>; 1516 1517// ASIMD shift by immed, basic 1518def : InstRW<[V2Write_2cyc_1V13], (instregex "^SHL[dv]", "^SHLLv", "^SHRNv", 1519 "^SSHLLv", "^SSHR[dv]", "^USHLLv", 1520 "^USHR[dv]")>; 1521 1522// ASIMD shift by immed and insert, basic 1523def : InstRW<[V2Write_2cyc_1V13], (instregex "^SLI[dv]", "^SRI[dv]")>; 1524 1525// ASIMD shift by immed, complex 1526def : InstRW<[V2Write_4cyc_1V13], 1527 (instregex "^RSHRNv", "^SQRSHRU?N[bhsv]", "^(SQSHLU?|UQSHL)[bhsd]$", 1528 "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", 1529 "^SQSHRU?N[bhsv]", "^SRSHR[dv]", "^UQRSHRN[bhsv]", 1530 "^UQSHRN[bhsv]", "^URSHR[dv]")>; 1531 1532// ASIMD shift by register, basic 1533def : InstRW<[V2Write_2cyc_1V13], (instregex "^[SU]SHLv")>; 1534 1535// ASIMD shift by register, complex 1536def : InstRW<[V2Write_4cyc_1V13], 1537 (instregex "^[SU]RSHLv", "^[SU]QRSHLv", 1538 "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>; 1539 1540// ASIMD floating-point instructions 1541// ----------------------------------------------------------------------------- 1542 1543// ASIMD FP absolute value/difference 1544// ASIMD FP arith, normal 1545// ASIMD FP compare 1546// ASIMD FP complex add 1547// ASIMD FP max/min, normal 1548// ASIMD FP max/min, pairwise 1549// ASIMD FP negate 1550// Handled by SchedAlias<WriteV[dq], ...> 1551 1552// ASIMD FP complex multiply add 1553def : InstRW<[V2Wr_VFCMA, V2Rd_VFCMA], (instregex "^FCMLAv")>; 1554 1555// ASIMD FP convert, long (F16 to F32) 1556def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVTL(v4|v8)i16")>; 1557 1558// ASIMD FP convert, long (F32 to F64) 1559def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVTL(v2|v4)i32")>; 1560 1561// ASIMD FP convert, narrow (F32 to F16) 1562def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVTN(v4|v8)i16")>; 1563 1564// ASIMD FP convert, narrow (F64 to F32) 1565def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVTN(v2|v4)i32", 1566 "^FCVTXN(v2|v4)f32")>; 1567 1568// ASIMD FP convert, other, D-form F32 and Q-form F64 1569def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVT[AMNPZ][SU]v2f(32|64)$", 1570 "^FCVT[AMNPZ][SU]v1i64$", 1571 "^FCVTZ[SU]d$", 1572 "^[SU]CVTFv2f(32|64)$", 1573 "^[SU]CVTFv1i64$", 1574 "^[SU]CVTFd$")>; 1575 1576// ASIMD FP convert, other, D-form F16 and Q-form F32 1577def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVT[AMNPZ][SU]v4f(16|32)$", 1578 "^FCVT[AMNPZ][SU]v1i32$", 1579 "^FCVTZ[SU]s$", 1580 "^[SU]CVTFv4f(16|32)$", 1581 "^[SU]CVTFv1i32$", 1582 "^[SU]CVTFs$")>; 1583 1584// ASIMD FP convert, other, Q-form F16 1585def : InstRW<[V2Write_6cyc_4V02], (instregex "^FCVT[AMNPZ][SU]v8f16$", 1586 "^FCVT[AMNPZ][SU]v1f16$", 1587 "^FCVTZ[SU]h$", 1588 "^[SU]CVTFv8f16$", 1589 "^[SU]CVTFv1i16$", 1590 "^[SU]CVTFh$")>; 1591 1592// ASIMD FP divide, D-form, F16 1593def : InstRW<[V2Write_7cyc_1V02_7rc], (instrs FDIVv4f16)>; 1594 1595// ASIMD FP divide, D-form, F32 1596def : InstRW<[V2Write_10cyc_1V02_5rc], (instrs FDIVv2f32)>; 1597 1598// ASIMD FP divide, Q-form, F16 1599def : InstRW<[V2Write_13cyc_1V02_13rc], (instrs FDIVv8f16)>; 1600 1601// ASIMD FP divide, Q-form, F32 1602def : InstRW<[V2Write_10cyc_1V02_10rc], (instrs FDIVv4f32)>; 1603 1604// ASIMD FP divide, Q-form, F64 1605def : InstRW<[V2Write_15cyc_1V02_14rc], (instrs FDIVv2f64)>; 1606 1607// ASIMD FP max/min, reduce, F32 and D-form F16 1608def : InstRW<[V2Write_4cyc_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>; 1609 1610// ASIMD FP max/min, reduce, Q-form F16 1611def : InstRW<[V2Write_6cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>; 1612 1613// ASIMD FP multiply 1614def : InstRW<[V2Wr_VFM], (instregex "^FMULv", "^FMULXv")>; 1615 1616// ASIMD FP multiply accumulate 1617def : InstRW<[V2Wr_VFMA, V2Rd_VFMA], (instregex "^FMLAv", "^FMLSv")>; 1618 1619// ASIMD FP multiply accumulate long 1620def : InstRW<[V2Wr_VFMAL, V2Rd_VFMAL], (instregex "^FML[AS]L2?(lane)?v")>; 1621 1622// ASIMD FP round, D-form F32 and Q-form F64 1623def : InstRW<[V2Write_3cyc_1V02], 1624 (instregex "^FRINT[AIMNPXZ]v2f(32|64)$", 1625 "^FRINT(32|64)[XZ]v2f(32|64)$")>; 1626 1627// ASIMD FP round, D-form F16 and Q-form F32 1628def : InstRW<[V2Write_4cyc_2V02], 1629 (instregex "^FRINT[AIMNPXZ]v4f(16|32)$", 1630 "^FRINT(32|64)[XZ]v4f32$")>; 1631 1632// ASIMD FP round, Q-form F16 1633def : InstRW<[V2Write_6cyc_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>; 1634 1635// ASIMD FP square root, D-form, F16 1636def : InstRW<[V2Write_7cyc_1V02_7rc], (instrs FSQRTv4f16)>; 1637 1638// ASIMD FP square root, D-form, F32 1639def : InstRW<[V2Write_10cyc_1V02_5rc], (instrs FSQRTv2f32)>; 1640 1641// ASIMD FP square root, Q-form, F16 1642def : InstRW<[V2Write_13cyc_1V02_13rc], (instrs FSQRTv8f16)>; 1643 1644// ASIMD FP square root, Q-form, F32 1645def : InstRW<[V2Write_10cyc_1V02_9rc], (instrs FSQRTv4f32)>; 1646 1647// ASIMD FP square root, Q-form, F64 1648def : InstRW<[V2Write_16cyc_1V02_15rc], (instrs FSQRTv2f64)>; 1649 1650// ASIMD BFloat16 (BF16) instructions 1651// ----------------------------------------------------------------------------- 1652 1653// ASIMD convert, F32 to BF16 1654def : InstRW<[V2Write_4cyc_2V02], (instrs BFCVTN, BFCVTN2)>; 1655 1656// ASIMD dot product 1657def : InstRW<[V2Wr_VBFDOT, V2Rd_VBFDOT], (instrs BFDOTv4bf16, BFDOTv8bf16)>; 1658 1659// ASIMD matrix multiply accumulate 1660def : InstRW<[V2Wr_VBFMMA, V2Rd_VBFMMA], (instrs BFMMLA)>; 1661 1662// ASIMD multiply accumulate long 1663def : InstRW<[V2Wr_VBFMAL, V2Rd_VBFMAL], (instrs BFMLALB, BFMLALBIdx, BFMLALT, 1664 BFMLALTIdx)>; 1665 1666// Scalar convert, F32 to BF16 1667def : InstRW<[V2Write_3cyc_1V02], (instrs BFCVT)>; 1668 1669// ASIMD miscellaneous instructions 1670// ----------------------------------------------------------------------------- 1671 1672// ASIMD bit reverse 1673// ASIMD bitwise insert 1674// ASIMD count 1675// ASIMD duplicate, element 1676// ASIMD extract 1677// ASIMD extract narrow 1678// ASIMD insert, element to element 1679// ASIMD move, FP immed 1680// ASIMD move, integer immed 1681// ASIMD reverse 1682// ASIMD table lookup extension, 1 table reg 1683// ASIMD transpose 1684// ASIMD unzip/zip 1685// Handled by SchedAlias<WriteV[dq], ...> 1686def : InstRW<[V2Write_0or2cyc_1V], (instrs MOVID, MOVIv2d_ns)>; 1687 1688// ASIMD duplicate, gen reg 1689def : InstRW<[V2Write_3cyc_1M0], (instregex "^DUPv.+gpr")>; 1690 1691// ASIMD extract narrow, saturating 1692def : InstRW<[V2Write_4cyc_1V13], (instregex "^[SU]QXTNv", "^SQXTUNv")>; 1693 1694// ASIMD reciprocal and square root estimate, D-form U32 1695def : InstRW<[V2Write_3cyc_1V02], (instrs URECPEv2i32, URSQRTEv2i32)>; 1696 1697// ASIMD reciprocal and square root estimate, Q-form U32 1698def : InstRW<[V2Write_4cyc_2V02], (instrs URECPEv4i32, URSQRTEv4i32)>; 1699 1700// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms 1701def : InstRW<[V2Write_3cyc_1V02], (instrs FRECPEv1f16, FRECPEv1i32, 1702 FRECPEv1i64, FRECPEv2f32, 1703 FRSQRTEv1f16, FRSQRTEv1i32, 1704 FRSQRTEv1i64, FRSQRTEv2f32)>; 1705 1706// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32 1707def : InstRW<[V2Write_4cyc_2V02], (instrs FRECPEv4f16, FRECPEv4f32, 1708 FRSQRTEv4f16, FRSQRTEv4f32)>; 1709 1710// ASIMD reciprocal and square root estimate, Q-form F16 1711def : InstRW<[V2Write_6cyc_4V02], (instrs FRECPEv8f16, FRSQRTEv8f16)>; 1712 1713// ASIMD reciprocal exponent 1714def : InstRW<[V2Write_3cyc_1V02], (instregex "^FRECPXv")>; 1715 1716// ASIMD reciprocal step 1717def : InstRW<[V2Write_4cyc_1V], (instregex "^FRECPS(32|64|v)", 1718 "^FRSQRTS(32|64|v)")>; 1719 1720// ASIMD table lookup, 1 or 2 table regs 1721def : InstRW<[V2Write_2cyc_1V01], (instrs TBLv8i8One, TBLv16i8One, 1722 TBLv8i8Two, TBLv16i8Two)>; 1723 1724// ASIMD table lookup, 3 table regs 1725def : InstRW<[V2Write_4cyc_2V01], (instrs TBLv8i8Three, TBLv16i8Three)>; 1726 1727// ASIMD table lookup, 4 table regs 1728def : InstRW<[V2Write_4cyc_3V01], (instrs TBLv8i8Four, TBLv16i8Four)>; 1729 1730// ASIMD table lookup extension, 2 table reg 1731def : InstRW<[V2Write_4cyc_2V], (instrs TBXv8i8Two, TBXv16i8Two)>; 1732 1733// ASIMD table lookup extension, 3 table reg 1734def : InstRW<[V2Write_6cyc_3V], (instrs TBXv8i8Three, TBXv16i8Three)>; 1735 1736// ASIMD table lookup extension, 4 table reg 1737def : InstRW<[V2Write_6cyc_5V], (instrs TBXv8i8Four, TBXv16i8Four)>; 1738 1739// ASIMD transfer, element to gen reg 1740def : InstRW<[V2Write_2cyc_2V01], (instregex "^[SU]MOVv")>; 1741 1742// ASIMD transfer, gen reg to element 1743def : InstRW<[V2Write_5cyc_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>; 1744 1745// ASIMD load instructions 1746// ----------------------------------------------------------------------------- 1747 1748// ASIMD load, 1 element, multiple, 1 reg, D-form 1749def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>; 1750def : InstRW<[WriteAdr, V2Write_6cyc_1L], 1751 (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>; 1752 1753// ASIMD load, 1 element, multiple, 1 reg, Q-form 1754def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>; 1755def : InstRW<[WriteAdr, V2Write_6cyc_1L], 1756 (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>; 1757 1758// ASIMD load, 1 element, multiple, 2 reg, D-form 1759def : InstRW<[V2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; 1760def : InstRW<[WriteAdr, V2Write_6cyc_2L], 1761 (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; 1762 1763// ASIMD load, 1 element, multiple, 2 reg, Q-form 1764def : InstRW<[V2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; 1765def : InstRW<[WriteAdr, V2Write_6cyc_2L], 1766 (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; 1767 1768// ASIMD load, 1 element, multiple, 3 reg, D-form 1769def : InstRW<[V2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; 1770def : InstRW<[WriteAdr, V2Write_6cyc_3L], 1771 (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; 1772 1773// ASIMD load, 1 element, multiple, 3 reg, Q-form 1774def : InstRW<[V2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; 1775def : InstRW<[WriteAdr, V2Write_6cyc_3L], 1776 (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; 1777 1778// ASIMD load, 1 element, multiple, 4 reg, D-form 1779def : InstRW<[V2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; 1780def : InstRW<[WriteAdr, V2Write_7cyc_4L], 1781 (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; 1782 1783// ASIMD load, 1 element, multiple, 4 reg, Q-form 1784def : InstRW<[V2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; 1785def : InstRW<[WriteAdr, V2Write_7cyc_4L], 1786 (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; 1787 1788// ASIMD load, 1 element, one lane, B/H/S 1789// ASIMD load, 1 element, one lane, D 1790def : InstRW<[V2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)$")>; 1791def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>; 1792 1793// ASIMD load, 1 element, all lanes, D-form, B/H/S 1794// ASIMD load, 1 element, all lanes, D-form, D 1795def : InstRW<[V2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>; 1796def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>; 1797 1798// ASIMD load, 1 element, all lanes, Q-form 1799def : InstRW<[V2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; 1800def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; 1801 1802// ASIMD load, 2 element, multiple, D-form, B/H/S 1803def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>; 1804def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>; 1805 1806// ASIMD load, 2 element, multiple, Q-form, B/H/S 1807// ASIMD load, 2 element, multiple, Q-form, D 1808def : InstRW<[V2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>; 1809def : InstRW<[WriteAdr, V2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>; 1810 1811// ASIMD load, 2 element, one lane, B/H 1812// ASIMD load, 2 element, one lane, S 1813// ASIMD load, 2 element, one lane, D 1814def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)$")>; 1815def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>; 1816 1817// ASIMD load, 2 element, all lanes, D-form, B/H/S 1818// ASIMD load, 2 element, all lanes, D-form, D 1819def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>; 1820def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>; 1821 1822// ASIMD load, 2 element, all lanes, Q-form 1823def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; 1824def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; 1825 1826// ASIMD load, 3 element, multiple, D-form, B/H/S 1827def : InstRW<[V2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>; 1828def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>; 1829 1830// ASIMD load, 3 element, multiple, Q-form, B/H/S 1831// ASIMD load, 3 element, multiple, Q-form, D 1832def : InstRW<[V2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)$")>; 1833def : InstRW<[WriteAdr, V2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; 1834 1835// ASIMD load, 3 element, one lane, B/H 1836// ASIMD load, 3 element, one lane, S 1837// ASIMD load, 3 element, one lane, D 1838def : InstRW<[V2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)$")>; 1839def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>; 1840 1841// ASIMD load, 3 element, all lanes, D-form, B/H/S 1842// ASIMD load, 3 element, all lanes, D-form, D 1843def : InstRW<[V2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>; 1844def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>; 1845 1846// ASIMD load, 3 element, all lanes, Q-form, B/H/S 1847// ASIMD load, 3 element, all lanes, Q-form, D 1848def : InstRW<[V2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>; 1849def : InstRW<[WriteAdr, V2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>; 1850 1851// ASIMD load, 4 element, multiple, D-form, B/H/S 1852def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>; 1853def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; 1854 1855// ASIMD load, 4 element, multiple, Q-form, B/H/S 1856// ASIMD load, 4 element, multiple, Q-form, D 1857def : InstRW<[V2Write_9cyc_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; 1858def : InstRW<[WriteAdr, V2Write_9cyc_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; 1859 1860// ASIMD load, 4 element, one lane, B/H 1861// ASIMD load, 4 element, one lane, S 1862// ASIMD load, 4 element, one lane, D 1863def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)$")>; 1864def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>; 1865 1866// ASIMD load, 4 element, all lanes, D-form, B/H/S 1867// ASIMD load, 4 element, all lanes, D-form, D 1868def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>; 1869def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>; 1870 1871// ASIMD load, 4 element, all lanes, Q-form, B/H/S 1872// ASIMD load, 4 element, all lanes, Q-form, D 1873def : InstRW<[V2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>; 1874def : InstRW<[WriteAdr, V2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>; 1875 1876// ASIMD store instructions 1877// ----------------------------------------------------------------------------- 1878 1879// ASIMD store, 1 element, multiple, 1 reg, D-form 1880def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(8b|4h|2s|1d)$")>; 1881def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; 1882 1883// ASIMD store, 1 element, multiple, 1 reg, Q-form 1884def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(16b|8h|4s|2d)$")>; 1885def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; 1886 1887// ASIMD store, 1 element, multiple, 2 reg, D-form 1888def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "ST1Twov(8b|4h|2s|1d)$")>; 1889def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; 1890 1891// ASIMD store, 1 element, multiple, 2 reg, Q-form 1892def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "ST1Twov(16b|8h|4s|2d)$")>; 1893def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; 1894 1895// ASIMD store, 1 element, multiple, 3 reg, D-form 1896def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "ST1Threev(8b|4h|2s|1d)$")>; 1897def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; 1898 1899// ASIMD store, 1 element, multiple, 3 reg, Q-form 1900def : InstRW<[V2Write_2cyc_3L01_3V01], (instregex "ST1Threev(16b|8h|4s|2d)$")>; 1901def : InstRW<[WriteAdr, V2Write_2cyc_3L01_3V01], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; 1902 1903// ASIMD store, 1 element, multiple, 4 reg, D-form 1904def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; 1905def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; 1906 1907// ASIMD store, 1 element, multiple, 4 reg, Q-form 1908def : InstRW<[V2Write_2cyc_4L01_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; 1909def : InstRW<[WriteAdr, V2Write_2cyc_4L01_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; 1910 1911// ASIMD store, 1 element, one lane, B/H/S 1912// ASIMD store, 1 element, one lane, D 1913def : InstRW<[V2Write_4cyc_1L01_2V01], (instregex "ST1i(8|16|32|64)$")>; 1914def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST1i(8|16|32|64)_POST$")>; 1915 1916// ASIMD store, 2 element, multiple, D-form, B/H/S 1917def : InstRW<[V2Write_4cyc_1L01_2V01], (instregex "ST2Twov(8b|4h|2s)$")>; 1918def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST2Twov(8b|4h|2s)_POST$")>; 1919 1920// ASIMD store, 2 element, multiple, Q-form, B/H/S 1921// ASIMD store, 2 element, multiple, Q-form, D 1922def : InstRW<[V2Write_4cyc_2L01_4V01], (instregex "ST2Twov(16b|8h|4s|2d)$")>; 1923def : InstRW<[WriteAdr, V2Write_4cyc_2L01_4V01], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; 1924 1925// ASIMD store, 2 element, one lane, B/H/S 1926// ASIMD store, 2 element, one lane, D 1927def : InstRW<[V2Write_4cyc_1L01_2V01], (instregex "ST2i(8|16|32|64)$")>; 1928def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST2i(8|16|32|64)_POST$")>; 1929 1930// ASIMD store, 3 element, multiple, D-form, B/H/S 1931def : InstRW<[V2Write_5cyc_2L01_4V01], (instregex "ST3Threev(8b|4h|2s)$")>; 1932def : InstRW<[WriteAdr, V2Write_5cyc_2L01_4V01], (instregex "ST3Threev(8b|4h|2s)_POST$")>; 1933 1934// ASIMD store, 3 element, multiple, Q-form, B/H/S 1935// ASIMD store, 3 element, multiple, Q-form, D 1936def : InstRW<[V2Write_6cyc_3L01_6V01], (instregex "ST3Threev(16b|8h|4s|2d)$")>; 1937def : InstRW<[WriteAdr, V2Write_6cyc_3L01_6V01], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; 1938 1939// ASIMD store, 3 element, one lane, B/H 1940// ASIMD store, 3 element, one lane, S 1941// ASIMD store, 3 element, one lane, D 1942def : InstRW<[V2Write_5cyc_2L01_4V01], (instregex "ST3i(8|16|32|64)$")>; 1943def : InstRW<[WriteAdr, V2Write_5cyc_2L01_4V01], (instregex "ST3i(8|16|32|64)_POST$")>; 1944 1945// ASIMD store, 4 element, multiple, D-form, B/H/S 1946def : InstRW<[V2Write_6cyc_2L01_6V01], (instregex "ST4Fourv(8b|4h|2s)$")>; 1947def : InstRW<[WriteAdr, V2Write_6cyc_2L01_6V01], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; 1948 1949// ASIMD store, 4 element, multiple, Q-form, B/H/S 1950def : InstRW<[V2Write_7cyc_4L01_12V01], (instregex "ST4Fourv(16b|8h|4s)$")>; 1951def : InstRW<[WriteAdr, V2Write_7cyc_4L01_12V01], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; 1952 1953// ASIMD store, 4 element, multiple, Q-form, D 1954def : InstRW<[V2Write_5cyc_4L01_8V01], (instregex "ST4Fourv(2d)$")>; 1955def : InstRW<[WriteAdr, V2Write_5cyc_4L01_8V01], (instregex "ST4Fourv(2d)_POST$")>; 1956 1957// ASIMD store, 4 element, one lane, B/H/S 1958def : InstRW<[V2Write_6cyc_1L01_3V01], (instregex "ST4i(8|16|32)$")>; 1959def : InstRW<[WriteAdr, V2Write_6cyc_1L01_3V01], (instregex "ST4i(8|16|32)_POST$")>; 1960 1961// ASIMD store, 4 element, one lane, D 1962def : InstRW<[V2Write_4cyc_2L01_4V01], (instregex "ST4i(64)$")>; 1963def : InstRW<[WriteAdr, V2Write_4cyc_2L01_4V01], (instregex "ST4i(64)_POST$")>; 1964 1965// Cryptography extensions 1966// ----------------------------------------------------------------------------- 1967 1968// Crypto AES ops 1969def : InstRW<[V2Write_2cyc_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; 1970 1971// Crypto polynomial (64x64) multiply long 1972def : InstRW<[V2Write_2cyc_1V], (instrs PMULLv1i64, PMULLv2i64)>; 1973 1974// Crypto SHA1 hash acceleration op 1975// Crypto SHA1 schedule acceleration ops 1976def : InstRW<[V2Write_2cyc_1V0], (instregex "^SHA1(H|SU0|SU1)")>; 1977 1978// Crypto SHA1 hash acceleration ops 1979// Crypto SHA256 hash acceleration ops 1980def : InstRW<[V2Write_4cyc_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>; 1981 1982// Crypto SHA256 schedule acceleration ops 1983def : InstRW<[V2Write_2cyc_1V0], (instregex "^SHA256SU[01]")>; 1984 1985// Crypto SHA512 hash acceleration ops 1986def : InstRW<[V2Write_2cyc_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>; 1987 1988// Crypto SHA3 ops 1989def : InstRW<[V2Write_2cyc_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; 1990 1991// Crypto SM3 ops 1992def : InstRW<[V2Write_2cyc_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$", 1993 "^SM3TT[12][AB]$")>; 1994 1995// Crypto SM4 ops 1996def : InstRW<[V2Write_4cyc_1V0], (instrs SM4E, SM4ENCKEY)>; 1997 1998// CRC 1999// ----------------------------------------------------------------------------- 2000 2001def : InstRW<[V2Wr_CRC, V2Rd_CRC], (instregex "^CRC32")>; 2002 2003// SVE Predicate instructions 2004// ----------------------------------------------------------------------------- 2005 2006// Loop control, based on predicate 2007def : InstRW<[V2Write_2or3cyc_1M], (instrs BRKA_PPmP, BRKA_PPzP, 2008 BRKB_PPmP, BRKB_PPzP)>; 2009 2010// Loop control, based on predicate and flag setting 2011def : InstRW<[V2Write_3or4cyc_2M], (instrs BRKAS_PPzP, BRKBS_PPzP)>; 2012 2013// Loop control, propagating 2014def : InstRW<[V2Write_2or3cyc_1M0], (instrs BRKN_PPzP, BRKPA_PPzPP, 2015 BRKPB_PPzPP)>; 2016 2017// Loop control, propagating and flag setting 2018def : InstRW<[V2Write_3or4cyc_1M0_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP, 2019 BRKPBS_PPzPP)>; 2020 2021// Loop control, based on GPR 2022def : InstRW<[V2Write_3cyc_2M], 2023 (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; 2024def : InstRW<[V2Write_3cyc_2M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; 2025 2026// Loop terminate 2027def : InstRW<[V2Write_1cyc_2M], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; 2028 2029// Predicate counting scalar 2030def : InstRW<[V2Write_2cyc_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; 2031def : InstRW<[V2Write_2cyc_1M], 2032 (instregex "^(CNT|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI", 2033 "^SQ(DEC|INC)[BHWD]_XPiWdI", 2034 "^UQ(DEC|INC)[BHWD]_WPiI")>; 2035 2036// Predicate counting scalar, ALL, {1,2,4} 2037def : InstRW<[V2Write_IncDec], (instregex "^(DEC|INC)[BHWD]_XPiI")>; 2038 2039// Predicate counting scalar, active predicate 2040def : InstRW<[V2Write_2cyc_1M], 2041 (instregex "^CNTP_XPP_[BHSD]", 2042 "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", 2043 "^(UQDEC|UQINC)P_WP_[BHSD]", 2044 "^(SQDEC|SQINC)P_XPWd_[BHSD]")>; 2045 2046// Predicate counting vector, active predicate 2047def : InstRW<[V2Write_7cyc_1M_1M0_1V], 2048 (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; 2049 2050// Predicate logical 2051def : InstRW<[V2Write_1or2cyc_1M0], 2052 (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; 2053 2054// Predicate logical, flag setting 2055def : InstRW<[V2Write_1or2cyc_1M0_1M], 2056 (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; 2057 2058// Predicate reverse 2059def : InstRW<[V2Write_2cyc_1M], (instregex "^REV_PP_[BHSD]")>; 2060 2061// Predicate select 2062def : InstRW<[V2Write_1cyc_1M0], (instrs SEL_PPPP)>; 2063 2064// Predicate set 2065def : InstRW<[V2Write_2cyc_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; 2066 2067// Predicate set/initialize, set flags 2068def : InstRW<[V2Write_3cyc_2M], (instregex "^PTRUES_[BHSD]")>; 2069 2070// Predicate find first/next 2071def : InstRW<[V2Write_2cyc_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; 2072 2073// Predicate test 2074def : InstRW<[V2Write_1cyc_1M], (instrs PTEST_PP)>; 2075 2076// Predicate transpose 2077def : InstRW<[V2Write_2cyc_1M], (instregex "^TRN[12]_PPP_[BHSD]")>; 2078 2079// Predicate unpack and widen 2080def : InstRW<[V2Write_2cyc_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>; 2081 2082// Predicate zip/unzip 2083def : InstRW<[V2Write_2cyc_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSD]")>; 2084 2085// SVE integer instructions 2086// ----------------------------------------------------------------------------- 2087 2088// Arithmetic, absolute diff 2089def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]", 2090 "^[SU]ABD_ZPZZ_[BHSD]")>; 2091 2092// Arithmetic, absolute diff accum 2093def : InstRW<[V2Wr_ZA, V2Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; 2094 2095// Arithmetic, absolute diff accum long 2096def : InstRW<[V2Wr_ZA, V2Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; 2097 2098// Arithmetic, absolute diff long 2099def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; 2100 2101// Arithmetic, basic 2102def : InstRW<[V2Write_2cyc_1V], 2103 (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]", 2104 "^(ADD|SUB)_ZZZ_[BHSD]", 2105 "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]", 2106 "^(ADD|SUB|SUBR)_ZI_[BHSD]", 2107 "^ADR_[SU]XTW_ZZZ_D_[0123]", 2108 "^ADR_LSL_ZZZ_[SD]_[0123]", 2109 "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", 2110 "^SADDLBT_ZZZ_[HSD]", 2111 "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]", 2112 "^SSUBL(BT|TB)_ZZZ_[HSD]")>; 2113 2114// Arithmetic, complex 2115def : InstRW<[V2Write_2cyc_1V], 2116 (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", 2117 "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]", 2118 "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", 2119 "^[SU]Q(ADD|SUB)_ZI_[BHSD]", 2120 "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", 2121 "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; 2122 2123// Arithmetic, large integer 2124def : InstRW<[V2Write_2cyc_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; 2125 2126// Arithmetic, pairwise add 2127def : InstRW<[V2Write_2cyc_1V], (instregex "^ADDP_ZPmZ_[BHSD]")>; 2128 2129// Arithmetic, pairwise add and accum long 2130def : InstRW<[V2Wr_ZPA, ReadDefault, V2Rd_ZPA], 2131 (instregex "^[SU]ADALP_ZPmZ_[HSD]")>; 2132 2133// Arithmetic, shift 2134def : InstRW<[V2Write_2cyc_1V13], 2135 (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]", 2136 "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]", 2137 "^(ASR|LSL|LSR)_ZPmI_[BHSD]", 2138 "^(ASR|LSL|LSR)_ZPmZ_[BHSD]", 2139 "^(ASR|LSL|LSR)_ZZI_[BHSD]", 2140 "^(ASR|LSL|LSR)_ZPZ[IZ]_[BHSD]", 2141 "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; 2142 2143// Arithmetic, shift and accumulate 2144def : InstRW<[V2Wr_ZSA, V2Rd_ZSA], (instregex "^[SU]R?SRA_ZZI_[BHSD]")>; 2145 2146// Arithmetic, shift by immediate 2147def : InstRW<[V2Write_2cyc_1V13], (instregex "^SHRN[BT]_ZZI_[BHS]", 2148 "^[SU]SHLL[BT]_ZZI_[HSD]")>; 2149 2150// Arithmetic, shift by immediate and insert 2151def : InstRW<[V2Write_2cyc_1V13], (instregex "^(SLI|SRI)_ZZI_[BHSD]")>; 2152 2153// Arithmetic, shift complex 2154def : InstRW<[V2Write_4cyc_1V13], 2155 (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]", 2156 "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]", 2157 "^[SU]QR?SHL_ZPZZ_[BHSD]", 2158 "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]", 2159 "^SQSHRU?N[BT]_ZZI_[BHS]", 2160 "^UQR?SHRN[BT]_ZZI_[BHS]")>; 2161 2162// Arithmetic, shift right for divide 2163def : InstRW<[V2Write_4cyc_1V13], (instregex "^ASRD_(ZPmI|ZPZI)_[BHSD]")>; 2164 2165// Arithmetic, shift rounding 2166def : InstRW<[V2Write_4cyc_1V13], (instregex "^[SU]RSHLR?_ZPmZ_[BHSD]", 2167 "^[SU]RSHL_ZPZZ_[BHSD]", 2168 "^[SU]RSHR_(ZPmI|ZPZI)_[BHSD]")>; 2169 2170// Bit manipulation 2171def : InstRW<[V2Write_6cyc_2V1], (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]")>; 2172 2173// Bitwise select 2174def : InstRW<[V2Write_2cyc_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>; 2175 2176// Count/reverse bits 2177def : InstRW<[V2Write_2cyc_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]")>; 2178 2179// Broadcast logical bitmask immediate to vector 2180def : InstRW<[V2Write_2cyc_1V], (instrs DUPM_ZI)>; 2181 2182// Compare and set flags 2183def : InstRW<[V2Write_4or5cyc_1V0_1M0], 2184 (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", 2185 "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; 2186 2187// Complex add 2188def : InstRW<[V2Write_2cyc_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]")>; 2189 2190// Complex dot product 8-bit element 2191def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; 2192 2193// Complex dot product 16-bit element 2194def : InstRW<[V2Wr_ZDOTH, V2Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; 2195 2196// Complex multiply-add B, H, S element size 2197def : InstRW<[V2Wr_ZCMABHS, V2Rd_ZCMABHS], (instregex "^CMLA_ZZZ_[BHS]", 2198 "^CMLA_ZZZI_[HS]")>; 2199 2200// Complex multiply-add D element size 2201def : InstRW<[V2Wr_ZCMAD, V2Rd_ZCMAD], (instrs CMLA_ZZZ_D)>; 2202 2203// Conditional extract operations, scalar form 2204def : InstRW<[V2Write_8cyc_1M0_1V01], (instregex "^CLAST[AB]_RPZ_[BHSD]")>; 2205 2206// Conditional extract operations, SIMD&FP scalar and vector forms 2207def : InstRW<[V2Write_3cyc_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]", 2208 "^COMPACT_ZPZ_[SD]", 2209 "^SPLICE_ZPZZ?_[BHSD]")>; 2210 2211// Convert to floating point, 64b to float or convert to double 2212def : InstRW<[V2Write_3cyc_1V02], (instregex "^[SU]CVTF_ZPmZ_Dto[HSD]", 2213 "^[SU]CVTF_ZPmZ_StoD")>; 2214 2215// Convert to floating point, 32b to single or half 2216def : InstRW<[V2Write_4cyc_2V02], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>; 2217 2218// Convert to floating point, 16b to half 2219def : InstRW<[V2Write_6cyc_4V02], (instregex "^[SU]CVTF_ZPmZ_HtoH")>; 2220 2221// Copy, scalar 2222def : InstRW<[V2Write_5cyc_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]")>; 2223 2224// Copy, scalar SIMD&FP or imm 2225def : InstRW<[V2Write_2cyc_1V], (instregex "^CPY_ZPm[IV]_[BHSD]", 2226 "^CPY_ZPzI_[BHSD]")>; 2227 2228// Divides, 32 bit 2229def : InstRW<[V2Write_12cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_S", 2230 "^[SU]DIV_ZPZZ_S")>; 2231 2232// Divides, 64 bit 2233def : InstRW<[V2Write_20cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", 2234 "^[SU]DIV_ZPZZ_D")>; 2235 2236// Dot product, 8 bit 2237def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S")>; 2238 2239// Dot product, 8 bit, using signed and unsigned integers 2240def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; 2241 2242// Dot product, 16 bit 2243def : InstRW<[V2Wr_ZDOTH, V2Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D")>; 2244 2245// Duplicate, immediate and indexed form 2246def : InstRW<[V2Write_2cyc_1V], (instregex "^DUP_ZI_[BHSD]", 2247 "^DUP_ZZI_[BHSDQ]")>; 2248 2249// Duplicate, scalar form 2250def : InstRW<[V2Write_3cyc_1M0], (instregex "^DUP_ZR_[BHSD]")>; 2251 2252// Extend, sign or zero 2253def : InstRW<[V2Write_2cyc_1V13], (instregex "^[SU]XTB_ZPmZ_[HSD]", 2254 "^[SU]XTH_ZPmZ_[SD]", 2255 "^[SU]XTW_ZPmZ_[D]")>; 2256 2257// Extract 2258def : InstRW<[V2Write_2cyc_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; 2259 2260// Extract narrow saturating 2261def : InstRW<[V2Write_4cyc_1V13], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", 2262 "^SQXTUN[BT]_ZZ_[BHS]")>; 2263 2264// Extract/insert operation, SIMD and FP scalar form 2265def : InstRW<[V2Write_3cyc_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]", 2266 "^INSR_ZV_[BHSD]")>; 2267 2268// Extract/insert operation, scalar 2269def : InstRW<[V2Write_6cyc_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]", 2270 "^INSR_ZR_[BHSD]")>; 2271 2272// Histogram operations 2273def : InstRW<[V2Write_2cyc_1V], (instregex "^HISTCNT_ZPzZZ_[SD]", 2274 "^HISTSEG_ZZZ")>; 2275 2276// Horizontal operations, B, H, S form, immediate operands only 2277def : InstRW<[V2Write_4cyc_1V02], (instregex "^INDEX_II_[BHS]")>; 2278 2279// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar 2280// operands only / immediate, scalar operands 2281def : InstRW<[V2Write_7cyc_1M0_1V02], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>; 2282 2283// Horizontal operations, D form, immediate operands only 2284def : InstRW<[V2Write_5cyc_2V02], (instrs INDEX_II_D)>; 2285 2286// Horizontal operations, D form, scalar, immediate operands)/ scalar operands 2287// only / immediate, scalar operands 2288def : InstRW<[V2Write_8cyc_2M0_2V02], (instregex "^INDEX_(IR|RI|RR)_D")>; 2289 2290// Logical 2291def : InstRW<[V2Write_2cyc_1V], 2292 (instregex "^(AND|EOR|ORR)_ZI", 2293 "^(AND|BIC|EOR|ORR)_ZZZ", 2294 "^EOR(BT|TB)_ZZZ_[BHSD]", 2295 "^(AND|BIC|EOR|NOT|ORR)_(ZPmZ|ZPZZ)_[BHSD]", 2296 "^NOT_ZPmZ_[BHSD]")>; 2297 2298// Max/min, basic and pairwise 2299def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]", 2300 "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]", 2301 "^[SU](MAX|MIN)_ZPZZ_[BHSD]")>; 2302 2303// Matching operations 2304// FIXME: SOG p. 44, n. 5: If the consuming instruction has a flag source, the 2305// latency for this instruction is 4 cycles. 2306def : InstRW<[V2Write_2or3cyc_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]")>; 2307 2308// Matrix multiply-accumulate 2309def : InstRW<[V2Wr_ZMMA, V2Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; 2310 2311// Move prefix 2312def : InstRW<[V2Write_2cyc_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]", 2313 "^MOVPRFX_ZZ")>; 2314 2315// Multiply, B, H, S element size 2316def : InstRW<[V2Write_4cyc_1V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", 2317 "^MUL_ZPZZ_[BHS]", 2318 "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]", 2319 "^[SU]MULH_ZPZZ_[BHS]")>; 2320 2321// Multiply, D element size 2322def : InstRW<[V2Write_5cyc_2V02], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", 2323 "^MUL_ZPZZ_D", 2324 "^[SU]MULH_(ZPmZ|ZZZ)_D", 2325 "^[SU]MULH_ZPZZ_D")>; 2326 2327// Multiply long 2328def : InstRW<[V2Write_4cyc_1V02], (instregex "^[SU]MULL[BT]_ZZZI_[SD]", 2329 "^[SU]MULL[BT]_ZZZ_[HSD]")>; 2330 2331// Multiply accumulate, B, H, S element size 2332def : InstRW<[V2Wr_ZMABHS, V2Rd_ZMABHS], 2333 (instregex "^ML[AS]_ZZZI_[HS]", "^ML[AS]_ZPZZZ_[BHS]")>; 2334def : InstRW<[V2Wr_ZMABHS, ReadDefault, V2Rd_ZMABHS], 2335 (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>; 2336 2337// Multiply accumulate, D element size 2338def : InstRW<[V2Wr_ZMAD, V2Rd_ZMAD], 2339 (instregex "^ML[AS]_ZZZI_D", "^ML[AS]_ZPZZZ_D")>; 2340def : InstRW<[V2Wr_ZMAD, ReadDefault, V2Rd_ZMAD], 2341 (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; 2342 2343// Multiply accumulate long 2344def : InstRW<[V2Wr_ZMAL, V2Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]", 2345 "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>; 2346 2347// Multiply accumulate saturating doubling long regular 2348def : InstRW<[V2Wr_ZMASQL, V2Rd_ZMASQ], 2349 (instregex "^SQDML[AS]L(B|T|BT)_ZZZ_[HSD]", 2350 "^SQDML[AS]L[BT]_ZZZI_[SD]")>; 2351 2352// Multiply saturating doubling high, B, H, S element size 2353def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQDMULH_ZZZ_[BHS]", 2354 "^SQDMULH_ZZZI_[HS]")>; 2355 2356// Multiply saturating doubling high, D element size 2357def : InstRW<[V2Write_5cyc_2V02], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>; 2358 2359// Multiply saturating doubling long 2360def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQDMULL[BT]_ZZZ_[HSD]", 2361 "^SQDMULL[BT]_ZZZI_[SD]")>; 2362 2363// Multiply saturating rounding doubling regular/complex accumulate, B, H, S 2364// element size 2365def : InstRW<[V2Wr_ZMASQBHS, V2Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]", 2366 "^SQRDCMLAH_ZZZ_[BHS]", 2367 "^SQRDML[AS]H_ZZZI_[HS]", 2368 "^SQRDCMLAH_ZZZI_[HS]")>; 2369 2370// Multiply saturating rounding doubling regular/complex accumulate, D element 2371// size 2372def : InstRW<[V2Wr_ZMASQD, V2Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D", 2373 "^SQRDCMLAH_ZZZ_D")>; 2374 2375// Multiply saturating rounding doubling regular/complex, B, H, S element size 2376def : InstRW<[V2Write_4cyc_1V02], (instregex "^SQRDMULH_ZZZ_[BHS]", 2377 "^SQRDMULH_ZZZI_[HS]")>; 2378 2379// Multiply saturating rounding doubling regular/complex, D element size 2380def : InstRW<[V2Write_5cyc_2V02], (instregex "^SQRDMULH_ZZZI?_D")>; 2381 2382// Multiply/multiply long, (8x8) polynomial 2383def : InstRW<[V2Write_2cyc_1V23], (instregex "^PMUL_ZZZ_B", 2384 "^PMULL[BT]_ZZZ_[HDQ]")>; 2385 2386// Predicate counting vector 2387def : InstRW<[V2Write_2cyc_1V], (instregex "^([SU]Q)?(DEC|INC)[HWD]_ZPiI")>; 2388 2389// Reciprocal estimate 2390def : InstRW<[V2Write_4cyc_2V02], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; 2391 2392// Reduction, arithmetic, B form 2393def : InstRW<[V2Write_9cyc_2V_4V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; 2394 2395// Reduction, arithmetic, H form 2396def : InstRW<[V2Write_8cyc_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; 2397 2398// Reduction, arithmetic, S form 2399def : InstRW<[V2Write_6cyc_2V_2V13], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; 2400 2401// Reduction, arithmetic, D form 2402def : InstRW<[V2Write_4cyc_2V], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>; 2403 2404// Reduction, logical 2405def : InstRW<[V2Write_6cyc_1V_1V13], (instregex "^(AND|EOR|OR)V_VPZ_[BHSD]")>; 2406 2407// Reverse, vector 2408def : InstRW<[V2Write_2cyc_1V], (instregex "^REV_ZZ_[BHSD]", 2409 "^REVB_ZPmZ_[HSD]", 2410 "^REVH_ZPmZ_[SD]", 2411 "^REVW_ZPmZ_D")>; 2412 2413// Select, vector form 2414def : InstRW<[V2Write_2cyc_1V], (instregex "^SEL_ZPZZ_[BHSD]")>; 2415 2416// Table lookup 2417def : InstRW<[V2Write_2cyc_1V], (instregex "^TBL_ZZZZ?_[BHSD]")>; 2418 2419// Table lookup extension 2420def : InstRW<[V2Write_2cyc_1V], (instregex "^TBX_ZZZ_[BHSD]")>; 2421 2422// Transpose, vector form 2423def : InstRW<[V2Write_2cyc_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; 2424 2425// Unpack and extend 2426def : InstRW<[V2Write_2cyc_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; 2427 2428// Zip/unzip 2429def : InstRW<[V2Write_2cyc_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; 2430 2431// SVE floating-point instructions 2432// ----------------------------------------------------------------------------- 2433 2434// Floating point absolute value/difference 2435def : InstRW<[V2Write_2cyc_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]", 2436 "^FABD_ZPZZ_[HSD]", 2437 "^FABS_ZPmZ_[HSD]")>; 2438 2439// Floating point arithmetic 2440def : InstRW<[V2Write_2cyc_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]", 2441 "^F(ADD|SUB)_ZPZ[IZ]_[HSD]", 2442 "^FADDP_ZPmZZ_[HSD]", 2443 "^FNEG_ZPmZ_[HSD]", 2444 "^FSUBR_ZPm[IZ]_[HSD]", 2445 "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>; 2446 2447// Floating point associative add, F16 2448def : InstRW<[V2Write_10cyc_1V1_9rc], (instrs FADDA_VPZ_H)>; 2449 2450// Floating point associative add, F32 2451def : InstRW<[V2Write_6cyc_1V1_5rc], (instrs FADDA_VPZ_S)>; 2452 2453// Floating point associative add, F64 2454def : InstRW<[V2Write_4cyc_1V], (instrs FADDA_VPZ_D)>; 2455 2456// Floating point compare 2457def : InstRW<[V2Write_2cyc_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]", 2458 "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", 2459 "^FCM(LE|LT)_PPzZ0_[HSD]", 2460 "^FCMUO_PPzZZ_[HSD]")>; 2461 2462// Floating point complex add 2463def : InstRW<[V2Write_3cyc_1V], (instregex "^FCADD_ZPmZ_[HSD]")>; 2464 2465// Floating point complex multiply add 2466def : InstRW<[V2Wr_ZFCMA, ReadDefault, V2Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; 2467def : InstRW<[V2Wr_ZFCMA, V2Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; 2468 2469// Floating point convert, long or narrow (F16 to F32 or F32 to F16) 2470def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVT_ZPmZ_(HtoS|StoH)", 2471 "^FCVTLT_ZPmZ_HtoS", 2472 "^FCVTNT_ZPmZ_StoH")>; 2473 2474// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 2475// or F64 to F16) 2476def : InstRW<[V2Write_3cyc_1V02], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)", 2477 "^FCVTLT_ZPmZ_StoD", 2478 "^FCVTNT_ZPmZ_DtoS")>; 2479 2480// Floating point convert, round to odd 2481def : InstRW<[V2Write_3cyc_1V02], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>; 2482 2483// Floating point base2 log, F16 2484def : InstRW<[V2Write_6cyc_4V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>; 2485 2486// Floating point base2 log, F32 2487def : InstRW<[V2Write_4cyc_2V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>; 2488 2489// Floating point base2 log, F64 2490def : InstRW<[V2Write_3cyc_1V02], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>; 2491 2492// Floating point convert to integer, F16 2493def : InstRW<[V2Write_6cyc_4V02], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>; 2494 2495// Floating point convert to integer, F32 2496def : InstRW<[V2Write_4cyc_2V02], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>; 2497 2498// Floating point convert to integer, F64 2499def : InstRW<[V2Write_3cyc_1V02], 2500 (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>; 2501 2502// Floating point copy 2503def : InstRW<[V2Write_2cyc_1V], (instregex "^FCPY_ZPmI_[HSD]", 2504 "^FDUP_ZI_[HSD]")>; 2505 2506// Floating point divide, F16 2507def : InstRW<[V2Write_13cyc_1V02_12rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>; 2508 2509// Floating point divide, F32 2510def : InstRW<[V2Write_10cyc_1V02_9rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>; 2511 2512// Floating point divide, F64 2513def : InstRW<[V2Write_15cyc_1V02_14rc], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>; 2514 2515// Floating point min/max pairwise 2516def : InstRW<[V2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>; 2517 2518// Floating point min/max 2519def : InstRW<[V2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]", 2520 "^F(MAX|MIN)(NM)?_ZPZ[IZ]_[HSD]")>; 2521 2522// Floating point multiply 2523def : InstRW<[V2Write_3cyc_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", 2524 "^FMULX_ZPZZ_[HSD]", 2525 "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]", 2526 "^FMUL_ZPZ[IZ]_[HSD]")>; 2527 2528// Floating point multiply accumulate 2529def : InstRW<[V2Wr_ZFMA, ReadDefault, V2Rd_ZFMA], 2530 (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", 2531 "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; 2532def : InstRW<[V2Wr_ZFMA, V2Rd_ZFMA], 2533 (instregex "^FML[AS]_ZZZI_[HSD]", 2534 "^FN?ML[AS]_ZPZZZ_[HSD]")>; 2535 2536// Floating point multiply add/sub accumulate long 2537def : InstRW<[V2Wr_ZFMAL, V2Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>; 2538 2539// Floating point reciprocal estimate, F16 2540def : InstRW<[V2Write_6cyc_4V02], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>; 2541 2542// Floating point reciprocal estimate, F32 2543def : InstRW<[V2Write_4cyc_2V02], (instregex "^FR(ECP|SQRT)E_ZZ_S", "^FRECPX_ZPmZ_S")>; 2544 2545// Floating point reciprocal estimate, F64 2546def : InstRW<[V2Write_3cyc_1V02], (instregex "^FR(ECP|SQRT)E_ZZ_D", "^FRECPX_ZPmZ_D")>; 2547 2548// Floating point reciprocal step 2549def : InstRW<[V2Write_4cyc_1V], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; 2550 2551// Floating point reduction, F16 2552def : InstRW<[V2Write_8cyc_4V], 2553 (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H")>; 2554 2555// Floating point reduction, F32 2556def : InstRW<[V2Write_6cyc_3V], 2557 (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S")>; 2558 2559// Floating point reduction, F64 2560def : InstRW<[V2Write_4cyc_2V], 2561 (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D")>; 2562 2563// Floating point round to integral, F16 2564def : InstRW<[V2Write_6cyc_4V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>; 2565 2566// Floating point round to integral, F32 2567def : InstRW<[V2Write_4cyc_2V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>; 2568 2569// Floating point round to integral, F64 2570def : InstRW<[V2Write_3cyc_1V02], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>; 2571 2572// Floating point square root, F16 2573def : InstRW<[V2Write_13cyc_1V02_12rc], (instregex "^FSQRT_ZPmZ_H")>; 2574 2575// Floating point square root, F32 2576def : InstRW<[V2Write_10cyc_1V02_9rc], (instregex "^FSQRT_ZPmZ_S")>; 2577 2578// Floating point square root, F64 2579def : InstRW<[V2Write_16cyc_1V02_14rc], (instregex "^FSQRT_ZPmZ_D")>; 2580 2581// Floating point trigonometric exponentiation 2582def : InstRW<[V2Write_3cyc_1V1], (instregex "^FEXPA_ZZ_[HSD]")>; 2583 2584// Floating point trigonometric multiply add 2585def : InstRW<[V2Write_4cyc_1V], (instregex "^FTMAD_ZZI_[HSD]")>; 2586 2587// Floating point trigonometric, miscellaneous 2588def : InstRW<[V2Write_3cyc_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]")>; 2589 2590// SVE BFloat16 (BF16) instructions 2591// ----------------------------------------------------------------------------- 2592 2593// Convert, F32 to BF16 2594def : InstRW<[V2Write_4cyc_1V02], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; 2595 2596// Dot product 2597def : InstRW<[V2Wr_ZBFDOT, V2Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; 2598 2599// Matrix multiply accumulate 2600def : InstRW<[V2Wr_ZBFMMA, V2Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; 2601 2602// Multiply accumulate long 2603def : InstRW<[V2Wr_ZBFMAL, V2Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>; 2604 2605// SVE Load instructions 2606// ----------------------------------------------------------------------------- 2607 2608// Load vector 2609def : InstRW<[V2Write_6cyc_1L], (instrs LDR_ZXI)>; 2610 2611// Load predicate 2612def : InstRW<[V2Write_6cyc_1L_1M], (instrs LDR_PXI)>; 2613 2614// Contiguous load, scalar + imm 2615def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM$", 2616 "^LD1S?B_[HSD]_IMM$", 2617 "^LD1S?H_[SD]_IMM$", 2618 "^LD1S?W_D_IMM$" )>; 2619// Contiguous load, scalar + scalar 2620def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]$", 2621 "^LD1S?B_[HSD]$", 2622 "^LD1S?H_[SD]$", 2623 "^LD1S?W_D$" )>; 2624 2625// Contiguous load broadcast, scalar + imm 2626def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1R[BHWD]_IMM$", 2627 "^LD1RS?B_[HSD]_IMM$", 2628 "^LD1RS?H_[SD]_IMM$", 2629 "^LD1RW_D_IMM$", 2630 "^LD1RSW_IMM$", 2631 "^LD1RQ_[BHWD]_IMM$")>; 2632 2633// Contiguous load broadcast, scalar + scalar 2634def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1RQ_[BHWD]$")>; 2635 2636// Non temporal load, scalar + imm 2637// Non temporal load, scalar + scalar 2638def : InstRW<[V2Write_6cyc_1L], (instregex "^LDNT1[BHWD]_ZR[IR]$")>; 2639 2640// Non temporal gather load, vector + scalar 32-bit element size 2641def : InstRW<[V2Write_9cyc_2L_4V], (instregex "^LDNT1[BHW]_ZZR_S$", 2642 "^LDNT1S[BH]_ZZR_S$")>; 2643 2644// Non temporal gather load, vector + scalar 64-bit element size 2645def : InstRW<[V2Write_9cyc_2L_2V1], (instregex "^LDNT1S?[BHW]_ZZR_D$")>; 2646def : InstRW<[V2Write_9cyc_2L_2V1], (instrs LDNT1D_ZZR_D)>; 2647 2648// Contiguous first faulting load, scalar + scalar 2649def : InstRW<[V2Write_6cyc_1L_1S], (instregex "^LDFF1[BHWD]$", 2650 "^LDFF1S?B_[HSD]$", 2651 "^LDFF1S?H_[SD]$", 2652 "^LDFF1S?W_D$")>; 2653 2654// Contiguous non faulting load, scalar + imm 2655def : InstRW<[V2Write_6cyc_1L], (instregex "^LDNF1[BHWD]_IMM$", 2656 "^LDNF1S?B_[HSD]_IMM$", 2657 "^LDNF1S?H_[SD]_IMM$", 2658 "^LDNF1S?W_D_IMM$")>; 2659 2660// Contiguous Load two structures to two vectors, scalar + imm 2661def : InstRW<[V2Write_8cyc_2L_2V], (instregex "^LD2[BHWD]_IMM$")>; 2662 2663// Contiguous Load two structures to two vectors, scalar + scalar 2664def : InstRW<[V2Write_9cyc_2L_2V_2S], (instregex "^LD2[BHWD]$")>; 2665 2666// Contiguous Load three structures to three vectors, scalar + imm 2667def : InstRW<[V2Write_9cyc_3L_3V], (instregex "^LD3[BHWD]_IMM$")>; 2668 2669// Contiguous Load three structures to three vectors, scalar + scalar 2670def : InstRW<[V2Write_10cyc_3V_3L_3S], (instregex "^LD3[BHWD]$")>; 2671 2672// Contiguous Load four structures to four vectors, scalar + imm 2673def : InstRW<[V2Write_9cyc_4L_8V], (instregex "^LD4[BHWD]_IMM$")>; 2674 2675// Contiguous Load four structures to four vectors, scalar + scalar 2676def : InstRW<[V2Write_10cyc_4L_8V_4S], (instregex "^LD4[BHWD]$")>; 2677 2678// Gather load, vector + imm, 32-bit element size 2679def : InstRW<[V2Write_9cyc_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_IMM$", 2680 "^GLD(FF)?1W_IMM$")>; 2681 2682// Gather load, vector + imm, 64-bit element size 2683def : InstRW<[V2Write_9cyc_1L_4V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$", 2684 "^GLD(FF)?1D_IMM$")>; 2685 2686// Gather load, 32-bit scaled offset 2687def : InstRW<[V2Write_10cyc_1L_8V], 2688 (instregex "^GLD(FF)?1S?H_S_[SU]XTW_SCALED$", 2689 "^GLD(FF)?1W_[SU]XTW_SCALED")>; 2690 2691// Gather load, 64-bit scaled offset 2692// NOTE: These instructions are not specified in the SOG. 2693def : InstRW<[V2Write_10cyc_1L_4V], 2694 (instregex "^GLD(FF)?1S?[HW]_D_([SU]XTW_)?SCALED$", 2695 "^GLD(FF)?1D_([SU]XTW_)?SCALED$")>; 2696 2697// Gather load, 32-bit unpacked unscaled offset 2698def : InstRW<[V2Write_9cyc_1L_4V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", 2699 "^GLD(FF)?1W_[SU]XTW$")>; 2700 2701// Gather load, 64-bit unpacked unscaled offset 2702// NOTE: These instructions are not specified in the SOG. 2703def : InstRW<[V2Write_9cyc_1L_2V], 2704 (instregex "^GLD(FF)?1S?[BHW]_D(_[SU]XTW)?$", 2705 "^GLD(FF)?1D(_[SU]XTW)?$")>; 2706 2707// SVE Store instructions 2708// ----------------------------------------------------------------------------- 2709 2710// Store from predicate reg 2711def : InstRW<[V2Write_1cyc_1L01], (instrs STR_PXI)>; 2712 2713// Store from vector reg 2714def : InstRW<[V2Write_2cyc_1L01_1V01], (instrs STR_ZXI)>; 2715 2716// Contiguous store, scalar + imm 2717def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^ST1[BHWD]_IMM$", 2718 "^ST1B_[HSD]_IMM$", 2719 "^ST1H_[SD]_IMM$", 2720 "^ST1W_D_IMM$")>; 2721 2722// Contiguous store, scalar + scalar 2723def : InstRW<[V2Write_2cyc_1L01_1S_1V01], (instregex "^ST1H(_[SD])?$")>; 2724def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "^ST1[BWD]$", 2725 "^ST1B_[HSD]$", 2726 "^ST1W_D$")>; 2727 2728// Contiguous store two structures from two vectors, scalar + imm 2729def : InstRW<[V2Write_4cyc_1L01_1V01], (instregex "^ST2[BHWD]_IMM$")>; 2730 2731// Contiguous store two structures from two vectors, scalar + scalar 2732def : InstRW<[V2Write_4cyc_2L01_2S_2V01], (instrs ST2H)>; 2733def : InstRW<[V2Write_4cyc_2L01_2V01], (instregex "^ST2[BWD]$")>; 2734 2735// Contiguous store three structures from three vectors, scalar + imm 2736def : InstRW<[V2Write_7cyc_9L01_9V01], (instregex "^ST3[BHWD]_IMM$")>; 2737 2738// Contiguous store three structures from three vectors, scalar + scalar 2739def : InstRW<[V2Write_7cyc_9L01_9S_9V01], (instregex "^ST3[BHWD]$")>; 2740 2741// Contiguous store four structures from four vectors, scalar + imm 2742def : InstRW<[V2Write_11cyc_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>; 2743 2744// Contiguous store four structures from four vectors, scalar + scalar 2745def : InstRW<[V2Write_11cyc_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>; 2746 2747// Non temporal store, scalar + imm 2748def : InstRW<[V2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>; 2749 2750// Non temporal store, scalar + scalar 2751def : InstRW<[V2Write_2cyc_1L01_1S_1V], (instrs STNT1H_ZRR)>; 2752def : InstRW<[V2Write_2cyc_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>; 2753 2754// Scatter non temporal store, vector + scalar 32-bit element size 2755def : InstRW<[V2Write_4cyc_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>; 2756 2757// Scatter non temporal store, vector + scalar 64-bit element size 2758def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^STNT1[BHWD]_ZZR_D")>; 2759 2760// Scatter store vector + imm 32-bit element size 2761def : InstRW<[V2Write_4cyc_4L01_4V01], (instregex "^SST1[BH]_S_IMM$", 2762 "^SST1W_IMM$")>; 2763 2764// Scatter store vector + imm 64-bit element size 2765def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[BHW]_D_IMM$", 2766 "^SST1D_IMM$")>; 2767 2768// Scatter store, 32-bit scaled offset 2769def : InstRW<[V2Write_4cyc_4L01_4V01], 2770 (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; 2771 2772// Scatter store, 32-bit unpacked unscaled offset 2773def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[BHW]_D_[SU]XTW$", 2774 "^SST1D_[SU]XTW$")>; 2775 2776// Scatter store, 32-bit unpacked scaled offset 2777def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", 2778 "^SST1D_[SU]XTW_SCALED$")>; 2779 2780// Scatter store, 32-bit unscaled offset 2781def : InstRW<[V2Write_4cyc_4L01_4V01], (instregex "^SST1[BH]_S_[SU]XTW$", 2782 "^SST1W_[SU]XTW$")>; 2783 2784// Scatter store, 64-bit scaled offset 2785def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[HW]_D_SCALED$", 2786 "^SST1D_SCALED$")>; 2787 2788// Scatter store, 64-bit unscaled offset 2789def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "^SST1[BHW]_D$", 2790 "^SST1D$")>; 2791 2792// SVE Miscellaneous instructions 2793// ----------------------------------------------------------------------------- 2794 2795// Read first fault register, unpredicated 2796def : InstRW<[V2Write_2cyc_1M0], (instrs RDFFR_P)>; 2797 2798// Read first fault register, predicated 2799def : InstRW<[V2Write_3or4cyc_1M0_1M], (instrs RDFFR_PPz)>; 2800 2801// Read first fault register and set flags 2802def : InstRW<[V2Write_4or5cyc_2M0_2M], (instrs RDFFRS_PPz)>; 2803 2804// Set first fault register 2805// Write to first fault register 2806def : InstRW<[V2Write_2cyc_1M0], (instrs SETFFR, WRFFR)>; 2807 2808// Prefetch 2809// NOTE: This is not specified in the SOG. 2810def : InstRW<[V2Write_4cyc_1L], (instregex "^PRF[BHWD]")>; 2811 2812// SVE Cryptographic instructions 2813// ----------------------------------------------------------------------------- 2814 2815// Crypto AES ops 2816def : InstRW<[V2Write_2cyc_1V], (instregex "^AES[DE]_ZZZ_B$", 2817 "^AESI?MC_ZZ_B$")>; 2818 2819// Crypto SHA3 ops 2820def : InstRW<[V2Write_2cyc_1V0], (instregex "^(BCAX|EOR3)_ZZZZ$", 2821 "^RAX1_ZZZ_D$", 2822 "^XAR_ZZZI_[BHSD]$")>; 2823 2824// Crypto SM4 ops 2825def : InstRW<[V2Write_4cyc_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>; 2826 2827} 2828