//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver4 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//===----------------------------------------------------------------------===//

def Znver4Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;
  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
  let MicroOpBufferSize = 320;
  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 6.75K ops.
  // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
  // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
  // unrolling leading to excessive filling of the op-cache from frontend.
  let LoopMicroOpBufferSize = 108;
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;
  // Latency of a simple store operation.
  int StoreLatency = 1;
  // FIXME:
  let HighLatency = 25; // FIXME: any better choice?
  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  let CompleteModel = 1;
}

let SchedModel = Znver4Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to nine macro ops per cycle.
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn4ALU0 : ProcResource<1>;
def Zn4ALU1 : ProcResource<1>;
def Zn4ALU2 : ProcResource<1>;
def Zn4ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn4BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn4AGU0 : ProcResource<1>;
def Zn4AGU1 : ProcResource<1>;
def Zn4AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn4Divider = Zn4ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn4BRU0 = Zn4ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn4Multiplier = Zn4ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;

// General AGU operations
def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;

// Control flow: jumps, calls
def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;

// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 224 registers.
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

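// Note (illustrative, based on the generic RegisterFile class in
// TargetSchedule.td, not on Znver4-specific documentation): the operands
// above are, in order, the number of physical registers (224), the register
// classes mapped onto this file ([GR64, CCR]), the per-class register cost
// ([1, 1]), and a per-class flag saying whether moves of that class may be
// eliminated at rename ([1, 0], i.e. GPR moves can be eliminated, CC copies
// cannot). The remaining two operands are commented inline above.
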
// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
                           Zn4ALU1, Zn4AGU1,          // scheduler 1
                           Zn4ALU2, Zn4AGU2,          // scheduler 2
                           Zn4ALU3, Zn4BRU1           // scheduler 3
                          ]> {
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
def Zn4FP0 : ProcResource<1>;
def Zn4FP1 : ProcResource<1>;
def Zn4FP2 : ProcResource<1>;
def Zn4FP3 : ProcResource<1>;
def Zn4FP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn4FPFMul0 = Zn4FP0;
defvar Zn4FPFMul1 = Zn4FP1;

// (v)FADD*
defvar Zn4FPFAdd0 = Zn4FP2;
defvar Zn4FPFAdd1 = Zn4FP3;

// All convert operations except pack/unpack
defvar Zn4FPFCvt0 = Zn4FP2;
defvar Zn4FPFCvt1 = Zn4FP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn4FPFDiv = Zn4FP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn4FPFMisc0 = Zn4FP0;
defvar Zn4FPFMisc1 = Zn4FP1;
defvar Zn4FPFMisc2 = Zn4FP2;
defvar Zn4FPFMisc3 = Zn4FP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn4FPVAdd0 = Zn4FP0;
defvar Zn4FPVAdd1 = Zn4FP1;
defvar Zn4FPVAdd2 = Zn4FP2;
defvar Zn4FPVAdd3 = Zn4FP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn4FPVMul0 = Zn4FP0;
defvar Zn4FPVMul1 = Zn4FP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn4FPVShuf = Zn4FP1;
defvar Zn4FPVShufAux = Zn4FP2;

// Bit Shift Left/Right operations
defvar Zn4FPVShift0 = Zn4FP1;
defvar Zn4FPVShift1 = Zn4FP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn4FPVMisc0 = Zn4FP0;
defvar Zn4FPVMisc1 = Zn4FP1;
defvar Zn4FPVMisc2 = Zn4FP2;
defvar Zn4FPVMisc3 = Zn4FP3;

// *AES*
defvar Zn4FPAES0 = Zn4FP0;
defvar Zn4FPAES1 = Zn4FP1;

// *CLM*
defvar Zn4FPCLM0 = Zn4FP0;
defvar Zn4FPCLM1 = Zn4FP1;

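// Summary of the aliases above (derived from the defvar assignments, for
// readability only): FP0 handles FMUL/FMA, vector integer add, vector integer
// multiply, FP/packed-int moves and logicals, AES and CLMUL; FP1 handles
// FMUL/FMA, FDIV/FSQRT, vector integer add, shuffles, vector shifts, moves
// and logicals, AES and CLMUL; FP2 handles FADD, converts, vector integer
// add, shuffles (aux), vector shifts, moves and logicals; FP3 handles FADD,
// converts, vector integer add, vector integer multiply, moves and logicals.
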
// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;

// All convert operations except pack/unpack
def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// FIXUP and RANGE use FP01 pipelines
def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
// SCALE instructions use FP23 pipelines
def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn4FPLd01 = Zn4FP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
let Super = Zn4FP45 in
def Zn4FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;

def Zn4FPVAdd01 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
def Zn4FPVAdd12 : ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;

// AVX512 Opmask pipelines
def Zn4FPOpMask01 : ProcResGroup<[Zn4FP2, Zn4FP3]>;
def Zn4FPOpMask4 : ProcResGroup<[Zn4FP45]>;

// Integer Multiplies, SAD, Blendvb
def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;

// *AES*
def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;

// *CLM*
def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 192 vector registers
// of 512b each in zen4.
def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/        // scheduler 0
                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
                         ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?


//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipelines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn4LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
let Super = Zn4LSU in
def Zn4Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn4LoadQueue : LoadQueue<Zn4Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn4LSU in
def Zn4Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn4StoreQueue : StoreQueue<Zn4Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ReleaseAtCycles = Res;
    let NumMicroOps = UOps;
  }
}

multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  defm : __Zn4WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn4Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes],
                           !if(!empty(Res),
                             !listsplat(1, !size(ExePorts)),
                             Res))),
                       !add(UOps, LoadUOps)>;
}

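// For illustration (not an additional definition): instantiating the pair
// helper as __Zn4WriteResPair<WriteALU, [Zn4ALU0123], 1, [1], 1,
// Znver4Model.LoadLatency, 0, Zn4AGU012, 1> - which is what the
// Zn4WriteResIntPair<WriteALU, ...> defm below boils down to - produces two
// WriteRes records: the register form on [Zn4ALU0123] with Latency = 1,
// ReleaseAtCycles = [1], NumMicroOps = 1, and the folded-load form
// (WriteALU.Folded) on [Zn4AGU012, Zn4Load, Zn4ALU0123] with
// Latency = 1 + 4 = 5, ReleaseAtCycles = [1, 1, 1], NumMicroOps = 1.
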
// For classes without folded loads.
multiclass Zn4WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResZMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.LoadLatency,
                           LoadUOps, Zn4AGU012, LoadRes>;
}

multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 2,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;

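// Illustration (an assumption based on the generic X86 write definitions, not
// a Znver4-specific fact): a load-op-store instruction such as ADD32mr is
// scheduled as its folded-load write (WriteALULd, which already pays the
// AGU + load + ALU cost) followed by WriteRMW, which only adds the
// store-side resources defined above.
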
// Loads, stores, and moves, not folded with other operations.
defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;

def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
  let Latency = !add(Znver4Model.LoadLatency, 1);
  let ReleaseAtCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;

defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = Znver4Model.LoadLatency;
  let ReleaseAtCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
  let Latency = Znver4Model.StoreLatency;
  let ReleaseAtCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.

def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                        OR8i8, OR16i16, OR32i32, OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;

def Zn4WriteMaterialize32bitImm : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;

def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.

def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}

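// For reference (illustrative examples, not taken from the SOG): the
// predicate below classifies forms like "lea eax, [rdi + rsi*4 + 8]"
// (three address components and/or a scale other than 1) as slow and maps
// them to Zn4Write3OpsLEA above, while a simple "lea eax, [rdi + rsi]"
// keeps the default 1-cycle WriteLEA.
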
// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn4SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

def Zn4WriteLEA : SchedWriteVariant<[
    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.

defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
  let ReleaseAtCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;

def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measure as 2; for the others it's a guess.
// FIXME: latency for 8-bit division measures as 10; for the others it's a guess.
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;

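// Note on the division entries above: setting ReleaseAtCycles equal to the
// latency (e.g. [17] for the 64-bit forms) keeps the non-pipelined
// Zn4Divider busy for the whole operation, so back-to-back divisions are
// modeled with a reciprocal throughput of roughly one per 17 cycles rather
// than one per cycle.
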
defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.

defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.

def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.

def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.

def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
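// Note: the RCL/RCR overrides above and below use several micro ops
// (NumMicroOps up to 9), which suggests that rotate-through-carry by a count
// other than 1 is microcoded on this core; the model only encodes the
// resulting cost, not the expansion itself.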

def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis

// Floating point. This covers both scalar and vector operations.
defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.

def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;

defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare.
defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate.
defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add.
defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).

defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
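// General pattern in this block (an interpretation, not a statement from the
// SOG): the ZMM variants keep the same latency as their XMM/YMM counterparts
// but double the ReleaseAtCycles entry (e.g. [2] instead of [1]), matching
// Zen 4 executing 512-bit operations as two passes through the 256-bit wide
// FP datapath, which halves their throughput.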
defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;

// Vector integer operations.
defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
  let Latency = 4;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;

def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
                                          KANDBrr, KANDDrr, KANDQrr, KANDWrr,
                                          KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
                                          KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
                                          KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
                                          KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
                                          KORBrr, KORDrr, KORQrr, KORWrr,
                                          KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
                                          KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
                                          KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
                                          KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
                                          KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;

def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;

def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;

def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
  // TODO: All align instructions are expected to be of 4 cycle latency
  let Latency = 4;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)>;
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
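// Reminder about the InstRW overrides used throughout this file: a line like
// def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, ...)>; above rebinds
// only the listed opcodes to that SchedWriteRes for this model; every other
// instruction mapped to WriteVecALUY keeps the default behavior defined by
// the Zn4WriteResYMMPair<WriteVecALUY, ...> entry.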

defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer.
defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).

def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer.

defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).

def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 2;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 2;
}

defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).

def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}

defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).

defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).

defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).

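// Note on composed latencies: the store forms above (WriteCvtPS2PHSt and
// friends) fold Znver4Model.StoreLatency into the conversion latency via !add,
// and the folded-load writes below (e.g. Zn4WriteSHA1MSG1rm) likewise add
// Znver4Model.LoadLatency on top of the register-form latency while reusing
// (or slightly bumping) its micro-op count.
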
// CRC32 instruction.
defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;

def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [3];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 3];
  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 6;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis

// Load/store MXCSR
defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis

// Catch-all for expensive system instructions.
defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;

def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;

def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
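
// The InstRW overrides below pin specific AVX2 permutes (VPERM2I128/VPERM2F128,
// VPERMPS, VPERMPD/VPERMQ, VPERMD) to dedicated write classes; an InstRW
// mapping takes precedence over the generic shuffle SchedWrites above, which
// allows per-instruction latencies for these cross-lane operations.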

def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;

def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 7;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 6;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 5;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;

def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;

defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).

// Old microcoded instructions that nobody uses.
defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;

def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;

def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except that it provides a model for nops!
defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

def Zn4WriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
  let ReleaseAtCycles = [];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
                                            MOV64rr, MOV64rr_REV,
                                            MOVSX32rr32)>;

def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
  let Latency = 0;
  let ReleaseAtCycles = [];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
                                               XCHG64rr, XCHG64ar)>;

defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.

defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;

defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;

// Register-register moves in the equivalence class below are candidates for
// move elimination at register rename (zero-cycle moves).
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

// FIXUP and RANGE Instructions
def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
  "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
  "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
  "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
  )>;

// SCALE & REDUCE instructions
def Zn4WriteSCALErr : SchedWriteRes<[Zn4FPFMisc23]> {
  let Latency = 6;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSCALErr], (instregex
  "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
  "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
  )>;

// BF16PS Instructions
def Zn4WriteBF16 : SchedWriteRes<[Zn4FPFMisc23]> {
  let Latency = 6;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteBF16], (instregex
  "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
  )>;

// BUSD and VPMADD Instructions
def Zn4WriteBUSDr_VPMADDr : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 4;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
  "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
  "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
  )>;

// SHIFT instructions
def Zn4WriteSHIFTrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHIFTrr], (instregex
  "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
  "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)P(SLL|SRL|SRA)DQYri",
  "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
  "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
  "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
  "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
  )>;

def Zn4WriteSHIFTri : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHIFTri], (instregex
  "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
  )>;

// ALIGN Instructions
def Zn4WriteALIGN : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALIGN], (instregex
  "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
  )>;

// PACK Instructions
def Zn4WritePACK : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePACK], (instregex
  "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
  )>;

// MAX and MIN Instructions
def Zn4WriteFCmp64 : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteFCmp64], (instregex
  "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
  "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
  "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
  "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
  )>;

// MOV Instructions
def Zn4MOVDUPZ : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVDUPZ], (instregex
  "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)"
  )>;

def Zn4MOVS : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVS], (instregex
  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
  "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
  "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?|Z256?)(rr|rrk|rrkz)"
  )>;

def Zn4MOVSZ : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 4;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVSZ], (instregex
  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
  )>;

def Zn4MOVSrr : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 5;
  let ReleaseAtCycles = [5];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVSrr], (instregex
  "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
  )>;


// VPTEST Instructions
def Zn4VPTESTZ128 : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 3;
  let ReleaseAtCycles = [3];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ128], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
  )>;

def Zn4VPTESTZ256 : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 4;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ256], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
  )>;

def Zn4VPTESTZ : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 5;
  let ReleaseAtCycles = [5];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
  )>;

// CONFLICT Instructions
def Zn4CONFLICTZ128 : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4CONFLICTZ128], (instregex
  "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
  )>;

def Zn4CONFLICTrr : SchedWriteRes<[Zn4FPFMisc01, Zn4FPFMisc12, Zn4FPFMisc23]> {
  let Latency = 6;
  let ReleaseAtCycles = [2, 2, 2];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4CONFLICTrr], (instregex
  "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
  )>;

// RSQRT Instructions
def Zn4VRSQRT14PDZ256 : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 5;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
  "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
  )>;


// PERM Instructions
def Zn4PERMILP : SchedWriteRes<[Zn4FPFMisc123]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMILP], (instregex
  "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_128 : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 3;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_128], (instregex
  "VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)",
  "VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_128rr : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_128rr], (instregex
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
  "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_256 : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 4;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_256], (instregex
  "VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)",
  "VPERMP(S|D)Z256(rr|rrk|rrkz)",
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
  "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
  "VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)",
  "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2Z : SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 5;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2Z], (instregex
  "VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)",
  "VPERM(B|D|W)Z(rr|rrk|rrkz)",
  "VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)",
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
  "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
  "VPERMP(S|D)Z(rr|rrk|rrkz)"
  )>;

// ALU SLOW Misc Instructions
def Zn4VecALUZSlow : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VecALUZSlow], (instrs
  VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
  VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
  VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
  VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
  VPADDSWZ128rrk, VPADDSWZ128rrkz, VPADDUSBZ128rr, VPADDUSBZ128rrk,
  VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
  VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
  VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
  VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
  VPOPCNTQZ128rr, VPOPCNTQZ128rrk, VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
  VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz, VPSUBSBZ128rr, VPSUBSBZ128rrk,
  VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
  VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz, VPSUBUSWZ128rr,
  VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
  )>;


///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////
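
// A zero idiom (e.g. `xorl %eax, %eax` or `vpxor %xmm0, %xmm0, %xmm0`) always
// produces zero regardless of the previous register value, so its result does
// not truly depend on its source operands. The SchedWriteVariants below give
// such forms zero latency when ZeroIdiomPredicate recognizes the idiom
// (typically both source operands naming the same register) and fall back to
// the normal write class otherwise. IsZeroIdiomFunction and
// IsDepBreakingFunction publish the same classification so that tools such as
// llvm-mca can treat these instructions as dependency-breaking.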

def Zn4WriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
                                          XOR64rr, XOR64rr_REV,
                                          SUB32rr, SUB32rr_REV,
                                          SUB64rr, SUB64rr_REV)>;

def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
                                                CMP16rr, CMP16rr_REV,
                                                CMP32rr, CMP32rr_REV,
                                                CMP64rr, CMP64rr_REV)>;

def Zn4WriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
                                           VANDNPSrr, VANDNPDrr)>;

def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
                                            VANDNPSYrr, VANDNPDYrr)>;

def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr, PANDNrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;

def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;

def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomALUX],
             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP8rr, CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel