//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver4 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//===----------------------------------------------------------------------===//

def Znver4Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;
  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
  let MicroOpBufferSize = 320;
  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // Ideally for znver4 this should be 6.75K. However, we do not use that
  // value, considering its compile-time impact, and prefer the default
  // value instead.
  // Retaining a minimal value to influence unrolling, as we did for znver3.
  let LoopMicroOpBufferSize = 512;
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;
  // Latency of a simple store operation.
  int StoreLatency = 1;
  // FIXME:
  let HighLatency = 25; // FIXME: any better choice?
  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  let CompleteModel = 1;
}

let SchedModel = Znver4Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to nine macro ops per cycle.
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn4ALU0 : ProcResource<1>;
def Zn4ALU1 : ProcResource<1>;
def Zn4ALU2 : ProcResource<1>;
def Zn4ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn4BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn4AGU0 : ProcResource<1>;
def Zn4AGU1 : ProcResource<1>;
def Zn4AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn4Divider = Zn4ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn4BRU0 = Zn4ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn4Multiplier = Zn4ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;

// General AGU operations
def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;

// Control flow: jumps, calls
def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;

// Everything that isn't control flow, but still needs to access the CC register,
// namely: conditional moves, SETcc.
def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;

// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 224 registers.
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines.
def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
                           Zn4ALU1, Zn4AGU1,          // scheduler 1
                           Zn4ALU2, Zn4AGU2,          // scheduler 2
                           Zn4ALU3, Zn4BRU1           // scheduler 3
                          ]> {
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes.
def Zn4FP0 : ProcResource<1>;
def Zn4FP1 : ProcResource<1>;
def Zn4FP2 : ProcResource<1>;
def Zn4FP3 : ProcResource<1>;
def Zn4FP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn4FPFMul0 = Zn4FP0;
defvar Zn4FPFMul1 = Zn4FP1;

// (v)FADD*
defvar Zn4FPFAdd0 = Zn4FP2;
defvar Zn4FPFAdd1 = Zn4FP3;

// All convert operations except pack/unpack
defvar Zn4FPFCvt0 = Zn4FP2;
defvar Zn4FPFCvt1 = Zn4FP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn4FPFDiv = Zn4FP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn4FPFMisc0 = Zn4FP0;
defvar Zn4FPFMisc1 = Zn4FP1;
defvar Zn4FPFMisc2 = Zn4FP2;
defvar Zn4FPFMisc3 = Zn4FP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn4FPVAdd0 = Zn4FP0;
defvar Zn4FPVAdd1 = Zn4FP1;
defvar Zn4FPVAdd2 = Zn4FP2;
defvar Zn4FPVAdd3 = Zn4FP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn4FPVMul0 = Zn4FP0;
defvar Zn4FPVMul1 = Zn4FP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn4FPVShuf = Zn4FP1;
defvar Zn4FPVShufAux = Zn4FP2;

// Bit Shift Left/Right operations
defvar Zn4FPVShift0 = Zn4FP1;
defvar Zn4FPVShift1 = Zn4FP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn4FPVMisc0 = Zn4FP0;
defvar Zn4FPVMisc1 = Zn4FP1;
defvar Zn4FPVMisc2 = Zn4FP2;
defvar Zn4FPVMisc3 = Zn4FP3;

// *AES*
defvar Zn4FPAES0 = Zn4FP0;
defvar Zn4FPAES1 = Zn4FP1;

// *CLM*
defvar Zn4FPCLM0 = Zn4FP0;
defvar Zn4FPCLM1 = Zn4FP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;

// All convert operations except pack/unpack
def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// FIXUP and RANGE use FP01 pipelines
def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
// SCALE instructions use FP23 pipelines
def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn4FPLd01 = Zn4FP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
let Super = Zn4FP45 in
def Zn4FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;

def Zn4FPVAdd01 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
def Zn4FPVAdd12 : ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;

// AVX512 Opmask pipelines
def Zn4FPOpMask01 : ProcResGroup<[Zn4FP2, Zn4FP3]>;
def Zn4FPOpMask4  : ProcResGroup<[Zn4FP45]>;

// Integer Multiplies, SAD, Blendvb
def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;

// *AES*
def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;

// *CLM*
def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 192 vector registers
// of 512b each in zen4.
def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/        // scheduler 0
                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
                         ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if the floating-point scheduler is full.
// FIXME: how to model this properly?


//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipelines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn4LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
let Super = Zn4LSU in
def Zn4Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn4LoadQueue : LoadQueue<Zn4Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn4LSU in
def Zn4Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn4StoreQueue : StoreQueue<Zn4Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ReleaseAtCycles = Res;
    let NumMicroOps = UOps;
  }
}

multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  defm : __Zn4WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn4Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                           [],
                           !listconcat([1, LoadRes],
                                       !if(!empty(Res),
                                           !listsplat(1, !size(ExePorts)),
                                           Res))),
                       !add(UOps, LoadUOps)>;
}

// For classes without folded loads.
multiclass Zn4WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResZMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.LoadLatency,
                           LoadUOps, Zn4AGU012, LoadRes>;
}

multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 2,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;

def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
  let Latency = !add(Znver4Model.LoadLatency, 1);
  let ReleaseAtCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;

defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = Znver4Model.LoadLatency;
  let ReleaseAtCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
  let Latency = Znver4Model.StoreLatency;
  let ReleaseAtCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
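
// For illustration only (a sketch of what the helper multiclasses above expand
// to; these records are produced by the WriteALU defm itself, nothing extra is
// defined here). Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1> yields,
// via __Zn4WriteResPair/__Zn4WriteRes, one register-form and one folded-load
// WriteRes:
//   WriteRes<WriteALU,   [Zn4ALU0123]>
//     { Latency = 1; ReleaseAtCycles = [1]; NumMicroOps = 1; }
//   WriteRes<WriteALULd, [Zn4AGU012, Zn4Load, Zn4ALU0123]>
//     { Latency = 1 + Znver4Model.LoadLatency = 5;
//       ReleaseAtCycles = [1, 1, 1]; NumMicroOps = 1; }
// i.e. the folded-load variant prepends the AGU and load-queue resources and
// adds the integer load-to-use latency on top of the ALU latency.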

def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                        OR8i8, OR16i16, OR32i32, OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;

def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;

def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.

def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}

// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn4SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

def Zn4WriteLEA : SchedWriteVariant<[
    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.

defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
  let ReleaseAtCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;

def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: the uop count for 8-bit division measures as 2; for the others it's a guess.
// FIXME: the latency for 8-bit division measures as 10; for the others it's a guess.
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;

defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.

defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.

def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.

def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.

def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis

// Floating point. This covers both scalar and vector operations.
defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.
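
// For illustration only (a sketch, nothing extra is defined here): the vector
// pair helpers differ from the integer ones only in the folded-load operands,
// so the WriteFAdd defm above is expected to yield roughly:
//   WriteRes<WriteFAdd,   [Zn4FPFAdd01]>
//     { Latency = 3; ReleaseAtCycles = [1]; NumMicroOps = 1; }
//   WriteRes<WriteFAddLd, [Zn4FPLd01, Zn4Load, Zn4FPFAdd01]>
//     { Latency = 3 + Znver4Model.VecLoadLatency = 10;
//       ReleaseAtCycles = [1, 1, 1]; NumMicroOps = 1; }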

def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;

defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare.
defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate.
defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add.
defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).

defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;

// Vector integer operations.
defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
  let Latency = 4;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;

def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
                                          KANDBrr, KANDDrr, KANDQrr, KANDWrr,
                                          KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
                                          KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
                                          KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
                                          KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
                                          KORBrr, KORDrr, KORQrr, KORWrr,
                                          KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
                                          KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
                                          KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
                                          KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
                                          KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;

def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;

def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;

def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
  // TODO: All align instructions are expected to have a 4-cycle latency.
  let Latency = 4;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)>;

defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
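
// Note (an interpretation, not taken from the SOG): throughout this file the
// ZMM variants generally keep the XMM/YMM latency but double the
// ReleaseAtCycles entries (e.g. [2] for WriteVecALUZ vs. [1] for WriteVecALUY
// above), which presumably models 512-bit operations occupying the 256-bit
// wide FP datapath for two cycles.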

defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
1173defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM). 1174defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM). 1175defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends. 1176defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM). 1177defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM). 1178defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW. 1179defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM). 1180defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM). 1181defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM). 1182defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD. 1183defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM). 1184defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM). 1185defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS. 1186 1187// Vector insert/extract operations. 1188defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element. 1189defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr. 1190defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store. 1191 1192// MOVMSK operations. 1193defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>; 1194defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>; 1195defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>; 1196defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>; 1197 1198// Conversion between integer and float. 1199defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer. 1200defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM). 1201defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM). 1202defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM). 1203 1204def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> { 1205 let Latency = 1; 1206 let ReleaseAtCycles = [2]; 1207 let NumMicroOps = 2; 1208} 1209defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer. 1210 1211defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM). 1212defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM). 1213defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM). 1214 1215defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double. 1216defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM). 1217defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM). 1218defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM). 
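// NOTE (explanatory): memory and store forms in this section derive their
// latency from the model constants instead of hard-coding totals, e.g.
// WriteVecExtractSt above uses !add(1, Znver4Model.StoreLatency) and the
// hand-written *rm defs use
//   let Latency = !add(Znver4Model.LoadLatency, <register-form latency>);
// so retuning the model's load/store latencies propagates here automatically.
// The same composition is used for the conversion and SHA load forms that
// follow.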
1219 1220def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> { 1221 let Latency = 2; 1222 let ReleaseAtCycles = [6]; 1223 let NumMicroOps = 2; 1224} 1225 1226defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float. 1227defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM). 1228defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM). 1229defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM). 1230 1231def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> { 1232 let Latency = 3; 1233 let ReleaseAtCycles = [1]; 1234 let NumMicroOps = 2; 1235} 1236 1237defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion. 1238defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM). 1239defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM). 1240defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM). 1241 1242defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion. 1243defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM). 1244defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM). 1245defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM). 1246 1247defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion. 1248defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM). 1249defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM). 1250 1251defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion. 1252defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM). 1253defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM). 1254 1255defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion. 1256defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM). 1257defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM). 1258 1259// CRC32 instruction. 
1260defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>; 1261 1262def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> { 1263 let Latency = 2; 1264 let ReleaseAtCycles = [2]; 1265 let NumMicroOps = 2; 1266} 1267def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>; 1268 1269def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { 1270 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency); 1271 let ReleaseAtCycles = [1, 1, 2]; 1272 let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0); 1273} 1274def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>; 1275 1276def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> { 1277 let Latency = 1; 1278 let ReleaseAtCycles = [2]; 1279 let NumMicroOps = 1; 1280} 1281def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>; 1282 1283def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { 1284 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency); 1285 let ReleaseAtCycles = [1, 1, 2]; 1286 let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0); 1287} 1288def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>; 1289 1290def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> { 1291 let Latency = 2; 1292 let ReleaseAtCycles = [3]; 1293 let NumMicroOps = 2; 1294} 1295def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>; 1296 1297def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { 1298 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency); 1299 let ReleaseAtCycles = [1, 1, 3]; 1300 let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0); 1301} 1302def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>; 1303 1304def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> { 1305 let Latency = 3; 1306 let ReleaseAtCycles = [8]; 1307 let NumMicroOps = 4; 1308} 1309def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>; 1310 1311def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { 1312 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency); 1313 let ReleaseAtCycles = [1, 1, 8]; 1314 let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1); 1315} 1316def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>; 1317 1318def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> { 1319 let Latency = 6; 1320 let ReleaseAtCycles = [8]; 1321 let NumMicroOps = 1; 1322} 1323def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>; 1324 1325def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> { 1326 let Latency = 4; 1327 let ReleaseAtCycles = [8]; 1328 let NumMicroOps = 1; 1329} 1330def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>; 1331 1332// Strings instructions. 1333// Packed Compare Implicit Length Strings, Return Mask 1334defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>; 1335// Packed Compare Explicit Length Strings, Return Mask 1336defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>; 1337// Packed Compare Implicit Length Strings, Return Index 1338defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>; 1339// Packed Compare Explicit Length Strings, Return Index 1340defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>; 1341 1342// AES instructions. 
1343defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption. 1344defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn. 1345defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation. 1346 1347// Carry-less multiplication instructions. 1348defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>; 1349 1350// EMMS/FEMMS 1351defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis 1352 1353// Load/store MXCSR 1354defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis 1355defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis 1356 1357// Catch-all for expensive system instructions. 1358defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>; 1359 1360def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> { 1361 let Latency = 0; // FIXME: not from llvm-exegesis 1362 let ReleaseAtCycles = [1]; 1363 let NumMicroOps = 1; 1364} 1365def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>; 1366 1367def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> { 1368 let Latency = 10; // FIXME: not from llvm-exegesis 1369 let ReleaseAtCycles = [24]; 1370 let NumMicroOps = 18; 1371} 1372def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>; 1373 1374// AVX2. 1375defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles. 1376defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles. 1377defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles. 
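// NOTE (explanatory): several entries above are tagged
// "FIXME: latency not from llvm-exegesis". One way to revisit them is to
// measure the instruction with llvm-exegesis (latency and inverse-throughput
// modes) on Znver4 hardware and cross-check the numbers against what llvm-mca
// reports for this model with -mcpu=znver4, adjusting Latency /
// ReleaseAtCycles / NumMicroOps here until the two agree; the exact
// measurement workflow is not prescribed by this file.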
1378 1379def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> { 1380 let Latency = 3; 1381 let ReleaseAtCycles = [1]; 1382 let NumMicroOps = 1; 1383} 1384def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>; 1385 1386def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { 1387 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency); 1388 let ReleaseAtCycles = [1, 1, 1]; 1389 let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0); 1390} 1391def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>; 1392 1393def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> { 1394 let Latency = 7; 1395 let ReleaseAtCycles = [1]; 1396 let NumMicroOps = 2; 1397} 1398def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>; 1399 1400def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { 1401 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency); 1402 let ReleaseAtCycles = [1, 1, 2]; 1403 let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1); 1404} 1405def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>; 1406 1407def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> { 1408 let Latency = 6; 1409 let ReleaseAtCycles = [1]; 1410 let NumMicroOps = 2; 1411} 1412def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>; 1413 1414def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { 1415 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency); 1416 let ReleaseAtCycles = [1, 1, 2]; 1417 let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1); 1418} 1419def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>; 1420 1421def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> { 1422 let Latency = 5; 1423 let ReleaseAtCycles = [1]; 1424 let NumMicroOps = 2; 1425} 1426def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>; 1427 1428def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { 1429 let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency); 1430 let ReleaseAtCycles = [1, 1, 2]; 1431 let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0); 1432} 1433def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>; 1434 1435defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move. 1436defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles. 1437defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts. 1438defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM). 1439defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM). 1440 1441// Old microcoded instructions that nobody uses. 1442defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>; 1443 1444// Fence instructions. 1445defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>; 1446 1447def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> { 1448 let Latency = 1; 1449 let ReleaseAtCycles = [30]; 1450 let NumMicroOps = 1; 1451} 1452def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>; 1453 1454def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> { 1455 let Latency = 1; 1456 let ReleaseAtCycles = [1]; 1457 let NumMicroOps = 1; 1458} 1459def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>; 1460 1461// Nop, not very useful except that it provides a model for nops!
1462defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis 1463 1464 1465/////////////////////////////////////////////////////////////////////////////// 1466// Zero Cycle Move 1467/////////////////////////////////////////////////////////////////////////////// 1468 1469def Zn4WriteZeroLatency : SchedWriteRes<[]> { 1470 let Latency = 0; 1471 let ReleaseAtCycles = []; 1472 let NumMicroOps = 1; 1473} 1474def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV, 1475 MOV64rr, MOV64rr_REV, 1476 MOVSX32rr32)>; 1477 1478def Zn4WriteSwapRenameable : SchedWriteRes<[]> { 1479 let Latency = 0; 1480 let ReleaseAtCycles = []; 1481 let NumMicroOps = 2; 1482} 1483def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar, 1484 XCHG64rr, XCHG64ar)>; 1485 1486defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support. 1487 1488defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>; 1489defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>; 1490defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>; 1491 1492defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX 1493defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>; 1494defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>; 1495defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>; 1496 1497def : IsOptimizableRegisterMove<[ 1498 InstructionEquivalenceClass<[ 1499 // GPR variants. 1500 MOV32rr, MOV32rr_REV, 1501 MOV64rr, MOV64rr_REV, 1502 MOVSX32rr32, 1503 XCHG32rr, XCHG32ar, 1504 XCHG64rr, XCHG64ar, 1505 1506 // MMX variants. 1507 // MMX moves are *NOT* eliminated. 1508 1509 // SSE variants. 1510 MOVAPSrr, MOVAPSrr_REV, 1511 MOVUPSrr, MOVUPSrr_REV, 1512 MOVAPDrr, MOVAPDrr_REV, 1513 MOVUPDrr, MOVUPDrr_REV, 1514 MOVDQArr, MOVDQArr_REV, 1515 MOVDQUrr, MOVDQUrr_REV, 1516 1517 // AVX variants. 1518 VMOVAPSrr, VMOVAPSrr_REV, 1519 VMOVUPSrr, VMOVUPSrr_REV, 1520 VMOVAPDrr, VMOVAPDrr_REV, 1521 VMOVUPDrr, VMOVUPDrr_REV, 1522 VMOVDQArr, VMOVDQArr_REV, 1523 VMOVDQUrr, VMOVDQUrr_REV, 1524 1525 // AVX YMM variants. 
1526 VMOVAPSYrr, VMOVAPSYrr_REV, 1527 VMOVUPSYrr, VMOVUPSYrr_REV, 1528 VMOVAPDYrr, VMOVAPDYrr_REV, 1529 VMOVUPDYrr, VMOVUPDYrr_REV, 1530 VMOVDQAYrr, VMOVDQAYrr_REV, 1531 VMOVDQUYrr, VMOVDQUYrr_REV, 1532 ], TruePred > 1533]>; 1534 1535// FIXUP and RANGE Instructions 1536def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> { 1537 let Latency = 2; 1538 let ReleaseAtCycles = [2]; 1539 let NumMicroOps = 1; 1540} 1541def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex 1542 "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", 1543 "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)", 1544 "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz" 1545 )>; 1546 1547// SCALE & REDUCE instructions 1548def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> { 1549 let Latency = 6; 1550 let ReleaseAtCycles = [6]; 1551 let NumMicroOps = 2; 1552} 1553def : InstRW<[Zn4WriteSCALErr], (instregex 1554 "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)", 1555 "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)" 1556 )>; 1557 1558//BF16PS Instructions 1559def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> { 1560 let Latency = 6; 1561 let ReleaseAtCycles = [6]; 1562 let NumMicroOps = 2; 1563} 1564def : InstRW<[Zn4WriteBF16], (instregex 1565 "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)" 1566 )>; 1567 1568// BUSD and VPMADD Instructions 1569def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> { 1570 let Latency = 4; 1571 let ReleaseAtCycles = [4]; 1572 let NumMicroOps = 1; 1573} 1574def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex 1575 "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)", 1576 "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)" 1577 )>; 1578 1579// SHIFT instructions 1580def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> { 1581 let Latency = 2; 1582 let ReleaseAtCycles = [2]; 1583 let NumMicroOps = 1; 1584} 1585def : InstRW<[Zn4WriteSHIFTrr], (instregex 1586 "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)", 1587 "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)", 1588 "(V?)P(SLL|SRL|SRA)DQYri", 1589 "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri", 1590 "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)", 1591 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)", 1592 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)", 1593 "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)", 1594 "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz" 1595 )>; 1596 1597def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> { 1598 let Latency = 1; 1599 let ReleaseAtCycles = [1]; 1600 let NumMicroOps = 1; 1601} 1602def : InstRW<[Zn4WriteSHIFTri], (instregex 1603 "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)" 1604 )>; 1605 1606// ALIGN Instructions 1607def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> { 1608 let Latency = 2; 1609 let ReleaseAtCycles = [2]; 1610 let NumMicroOps = 1; 1611} 1612def : InstRW<[Zn4WriteALIGN], (instregex 1613 "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)" 1614 )>; 1615 1616//PACK Instructions 1617def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> { 1618 let Latency = 2; 1619 let ReleaseAtCycles = [2]; 1620 let NumMicroOps = 1; 1621} 1622def : InstRW<[Zn4WritePACK], (instregex 1623 "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)" 1624 )>; 1625 1626// MAX and MIN Instructions 1627def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> { 1628 let Latency = 2; 1629 let ReleaseAtCycles = [2]; 1630 let NumMicroOps = 1; 1631} 1632def : InstRW<[Zn4WriteFCmp64], (instregex 
1633 "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)", 1634 "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)", 1635 "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)", 1636 "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)" 1637 )>; 1638 1639// MOV Instructions 1640def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> { 1641 let Latency = 2; 1642 let ReleaseAtCycles = [2]; 1643 let NumMicroOps = 1; 1644} 1645def : InstRW<[Zn4MOVS], (instregex 1646 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)", 1647 "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)", 1648 "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)", 1649 "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)", 1650 "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)" 1651 )>; 1652 1653def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> { 1654 let Latency = 4; 1655 let ReleaseAtCycles = [4]; 1656 let NumMicroOps = 1; 1657} 1658def : InstRW<[Zn4MOVSZ], (instregex 1659 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)" 1660 )>; 1661 1662def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> { 1663 let Latency = 5; 1664 let ReleaseAtCycles = [5]; 1665 let NumMicroOps = 1; 1666} 1667def : InstRW<[Zn4MOVSrr], (instregex 1668 "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)" 1669 )>; 1670 1671 1672//VPTEST Instructions 1673def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> { 1674 let Latency = 3; 1675 let ReleaseAtCycles = [3]; 1676 let NumMicroOps = 1; 1677} 1678def : InstRW<[Zn4VPTESTZ128], (instregex 1679 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)" 1680 )>; 1681 1682def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> { 1683 let Latency = 4; 1684 let ReleaseAtCycles = [4]; 1685 let NumMicroOps = 1; 1686} 1687def : InstRW<[Zn4VPTESTZ256], (instregex 1688 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)" 1689 )>; 1690 1691def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> { 1692 let Latency = 5; 1693 let ReleaseAtCycles = [5]; 1694 let NumMicroOps = 1; 1695} 1696def : InstRW<[Zn4VPTESTZ], (instregex 1697 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)" 1698 )>; 1699 1700// CONFLICT Instructions 1701def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> { 1702 let Latency = 2; 1703 let ReleaseAtCycles = [2]; 1704 let NumMicroOps = 1; 1705} 1706def : InstRW<[Zn4CONFLICTZ128], (instregex 1707 "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)" 1708 )>; 1709 1710def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> { 1711 let Latency = 6; 1712 let ReleaseAtCycles = [2,2,2]; 1713 let NumMicroOps = 4; 1714} 1715def : InstRW<[Zn4CONFLICTrr], (instregex 1716 "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)" 1717 )>; 1718 1719// RSQRT Instructions 1720def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> { 1721 let Latency = 5; 1722 let ReleaseAtCycles = [2]; 1723 let NumMicroOps = 1; 1724} 1725def : InstRW<[Zn4VRSQRT14PDZ256], (instregex 1726 "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)" 1727 )>; 1728 1729 1730// PERM Instructions 1731def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> { 1732 let Latency = 2; 1733 let ReleaseAtCycles = [2]; 1734 let NumMicroOps = 1; 1735} 1736def : InstRW<[Zn4PERMILP], (instregex 1737 "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)" 1738 )>; 1739 1740def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> { 1741 let Latency = 3; 1742 let ReleaseAtCycles = [2]; 1743 let NumMicroOps = 1; 1744} 1745def : InstRW<[Zn4PERMIT2_128], (instregex 1746 "VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)", 1747 "VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)" 1748 )>; 1749 1750def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> { 
1751 let Latency = 2; 1752 let ReleaseAtCycles = [2]; 1753 let NumMicroOps = 1; 1754} 1755def : InstRW<[Zn4PERMIT2_128rr], (instregex 1756 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)", 1757 "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)" 1758 )>; 1759 1760def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> { 1761 let Latency = 4; 1762 let ReleaseAtCycles = [2]; 1763 let NumMicroOps = 1; 1764} 1765def : InstRW<[Zn4PERMIT2_256], (instregex 1766 "VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)", 1767 "VPERMP(S|D)Z256(rr|rrk|rrkz)", 1768 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)", 1769 "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)", 1770 "VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)", 1771 "VPEXPAND(B|W)Z256(rr|rrk|rrkz)" 1772 )>; 1773 1774def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> { 1775 let Latency = 5; 1776 let ReleaseAtCycles = [2]; 1777 let NumMicroOps = 1; 1778} 1779def : InstRW<[Zn4PERMIT2Z], (instregex 1780 "VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)", 1781 "VPERM(B|D|W)Z(rr|rrk|rrkz)", 1782 "VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)", 1783 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)", 1784 "VPEXPAND(B|W)Z(rr|rrk|rrkz)", 1785 "VPERMP(S|D)Z(rr|rrk|rrkz)" 1786 )>; 1787 1788// ALU SLOW Misc Instructions 1789def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> { 1790 let Latency = 2; 1791 let ReleaseAtCycles = [2]; 1792 let NumMicroOps = 1; 1793} 1794def : InstRW<[Zn4VecALUZSlow], (instrs 1795 VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr, 1796 VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk, 1797 VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz, 1798 VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr, 1799 VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk, 1800 VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz, 1801 VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr, 1802 VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk, 1803 VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz, 1804 VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr, 1805 VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk, 1806 VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz, 1807 VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr, 1808 VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz 1809 )>; 1810 1811 1812/////////////////////////////////////////////////////////////////////////////// 1813// Dependency breaking instructions. 1814/////////////////////////////////////////////////////////////////////////////// 1815 1816def Zn4WriteZeroIdiom : SchedWriteVariant<[ 1817 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1818 SchedVar<NoSchedPred, [WriteALU]> 1819]>; 1820def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV, 1821 XOR64rr, XOR64rr_REV, 1822 SUB32rr, SUB32rr_REV, 1823 SUB64rr, SUB64rr_REV)>; 1824 1825def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[ 1826 SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>, 1827 SchedVar<NoSchedPred, [WriteALU]> 1828]>; 1829def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV, 1830 CMP16rr, CMP16rr_REV, 1831 CMP32rr, CMP32rr_REV, 1832 CMP64rr, CMP64rr_REV)>; 1833 1834def Zn4WriteFZeroIdiom : SchedWriteVariant<[ 1835 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1836 SchedVar<NoSchedPred, [WriteFLogic]> 1837]>; 1838// NOTE: XORPSrr, XORPDrr are not zero-cycle! 
1839def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr, 1840 VANDNPSrr, VANDNPDrr)>; 1841 1842def Zn4WriteFZeroIdiomY : SchedWriteVariant<[ 1843 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1844 SchedVar<NoSchedPred, [WriteFLogicY]> 1845]>; 1846def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr, 1847 VANDNPSYrr, VANDNPDYrr)>; 1848 1849def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[ 1850 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1851 SchedVar<NoSchedPred, [WriteVecLogicX]> 1852]>; 1853// NOTE: PXORrr,PANDNrr are not zero-cycle! 1854def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>; 1855 1856def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[ 1857 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1858 SchedVar<NoSchedPred, [WriteVecLogicY]> 1859]>; 1860def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>; 1861 1862def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[ 1863 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1864 SchedVar<NoSchedPred, [WriteVecALUX]> 1865]>; 1866// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, 1867// PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle! 1868def : InstRW<[Zn4WriteVZeroIdiomALUX], 1869 (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, 1870 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>; 1871 1872def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[ 1873 SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, 1874 SchedVar<NoSchedPred, [WriteVecALUY]> 1875]>; 1876def : InstRW<[Zn4WriteVZeroIdiomALUY], 1877 (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, 1878 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>; 1879 1880def : IsZeroIdiomFunction<[ 1881 // GPR Zero-idioms. 1882 DepBreakingClass<[ XOR32rr, XOR32rr_REV, 1883 XOR64rr, XOR64rr_REV, 1884 SUB32rr, SUB32rr_REV, 1885 SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>, 1886 1887 // SSE XMM Zero-idioms. 1888 DepBreakingClass<[ 1889 // fp variants. 1890 XORPSrr, XORPDrr, 1891 ANDNPSrr, ANDNPDrr, 1892 1893 // int variants. 1894 PXORrr, 1895 PANDNrr, 1896 PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, 1897 PSUBSBrr, PSUBSWrr, 1898 PSUBUSBrr, PSUBUSWrr, 1899 PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr 1900 ], ZeroIdiomPredicate>, 1901 1902 // AVX XMM Zero-idioms. 1903 DepBreakingClass<[ 1904 // fp variants. 1905 VXORPSrr, VXORPDrr, 1906 VANDNPSrr, VANDNPDrr, 1907 1908 // int variants. 1909 VPXORrr, 1910 VPANDNrr, 1911 VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, 1912 VPSUBSBrr, VPSUBSWrr, 1913 VPSUBUSBrr, VPSUBUSWrr, 1914 VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, 1915 ], ZeroIdiomPredicate>, 1916 1917 // AVX YMM Zero-idioms. 1918 DepBreakingClass<[ 1919 // fp variants. 1920 VXORPSYrr, VXORPDYrr, 1921 VANDNPSYrr, VANDNPDYrr, 1922 1923 // int variants. 
1924 VPXORYrr, 1925 VPANDNYrr, 1926 VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, 1927 VPSUBSBYrr, VPSUBSWYrr, 1928 VPSUBUSBYrr, VPSUBUSWYrr, 1929 VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr 1930 ], ZeroIdiomPredicate>, 1931]>; 1932 1933def : IsDepBreakingFunction<[ 1934 // GPR 1935 DepBreakingClass<[ SBB32rr, SBB32rr_REV, 1936 SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>, 1937 DepBreakingClass<[ CMP8rr, CMP8rr_REV, 1938 CMP16rr, CMP16rr_REV, 1939 CMP32rr, CMP32rr_REV, 1940 CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >, 1941 // SSE 1942 DepBreakingClass<[ 1943 PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr 1944 ], ZeroIdiomPredicate>, 1945 1946 // AVX XMM 1947 DepBreakingClass<[ 1948 VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr 1949 ], ZeroIdiomPredicate>, 1950 1951 // AVX YMM 1952 DepBreakingClass<[ 1953 VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr 1954 ], ZeroIdiomPredicate>, 1955]>; 1956 1957} // SchedModel 1958 1959
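// NOTE (explanatory): the zero-idiom handling above works in two layers. The
// SchedWriteVariant defs (e.g. Zn4WriteZeroIdiom, Zn4WriteVZeroIdiomLogicX)
// select Zn4WriteZeroLatency when ZeroIdiomPredicate matches, i.e. when both
// source registers are identical, so for example
//   vxorps %xmm1, %xmm1, %xmm1   <- modeled as 0-latency, no execution pipe
//   vxorps %xmm2, %xmm1, %xmm0   <- keeps the normal WriteFLogic numbers
// while the IsZeroIdiomFunction / IsDepBreakingFunction lists additionally
// tell analysis tools such as llvm-mca that these forms do not truly depend
// on their input registers. That is why the non-VEX forms (XORPSrr, PXORrr,
// ...) appear in the zero-idiom/dependency-breaking lists but are deliberately
// left out of the zero-latency InstRW overrides: per the NOTEs above they
// still execute, they just break the dependency chain.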