//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver4 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//===----------------------------------------------------------------------===//

def Znver4Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;
  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
  let MicroOpBufferSize = 320;
  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // Ideally for znver4, we should have 6.75K. However we don't add that
  // considering the impact on compile time and prefer using default values
  // instead.
  // let LoopMicroOpBufferSize = 6750;
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;
  // Latency of a simple store operation.
  int StoreLatency = 1;
  // FIXME:
  let HighLatency = 25; // FIXME: any better choice?
  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  let CompleteModel = 1;
}

let SchedModel = Znver4Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to nine macro ops per cycle.
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn4ALU0 : ProcResource<1>;
def Zn4ALU1 : ProcResource<1>;
def Zn4ALU2 : ProcResource<1>;
def Zn4ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn4BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn4AGU0 : ProcResource<1>;
def Zn4AGU1 : ProcResource<1>;
def Zn4AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn4Divider = Zn4ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn4BRU0 = Zn4ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn4Multiplier = Zn4ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;

// General AGU operations
def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;

// Control flow: jumps, calls
def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;

// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 224 registers.
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
                           Zn4ALU1, Zn4AGU1,          // scheduler 1
                           Zn4ALU2, Zn4AGU2,          // scheduler 2
                           Zn4ALU3, Zn4BRU1           // scheduler 3
                          ]> {
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
def Zn4FP0 : ProcResource<1>;
def Zn4FP1 : ProcResource<1>;
def Zn4FP2 : ProcResource<1>;
def Zn4FP3 : ProcResource<1>;
def Zn4FP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn4FPFMul0 = Zn4FP0;
defvar Zn4FPFMul1 = Zn4FP1;

// (v)FADD*
defvar Zn4FPFAdd0 = Zn4FP2;
defvar Zn4FPFAdd1 = Zn4FP3;

// All convert operations except pack/unpack
defvar Zn4FPFCvt0 = Zn4FP2;
defvar Zn4FPFCvt1 = Zn4FP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn4FPFDiv = Zn4FP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn4FPFMisc0 = Zn4FP0;
defvar Zn4FPFMisc1 = Zn4FP1;
defvar Zn4FPFMisc2 = Zn4FP2;
defvar Zn4FPFMisc3 = Zn4FP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn4FPVAdd0 = Zn4FP0;
defvar Zn4FPVAdd1 = Zn4FP1;
defvar Zn4FPVAdd2 = Zn4FP2;
defvar Zn4FPVAdd3 = Zn4FP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn4FPVMul0 = Zn4FP0;
defvar Zn4FPVMul1 = Zn4FP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn4FPVShuf = Zn4FP1;
defvar Zn4FPVShufAux = Zn4FP2;

// Bit Shift Left/Right operations
defvar Zn4FPVShift0 = Zn4FP1;
defvar Zn4FPVShift1 = Zn4FP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn4FPVMisc0 = Zn4FP0;
defvar Zn4FPVMisc1 = Zn4FP1;
defvar Zn4FPVMisc2 = Zn4FP2;
defvar Zn4FPVMisc3 = Zn4FP3;

// *AES*
defvar Zn4FPAES0 = Zn4FP0;
defvar Zn4FPAES1 = Zn4FP1;

// *CLM*
defvar Zn4FPCLM0 = Zn4FP0;
defvar Zn4FPCLM1 = Zn4FP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;

// All convert operations except pack/unpack
def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// FIXUP and RANGE use FP01 pipelines
def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
// SCALE instructions use FP23 pipelines
def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn4FPLd01 = Zn4FP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
let Super = Zn4FP45 in
def Zn4FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;

def Zn4FPVAdd01 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
def Zn4FPVAdd12 : ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;

// AVX512 Opmask pipelines
def Zn4FPOpMask01 : ProcResGroup<[Zn4FP2, Zn4FP3]>;
def Zn4FPOpMask4 : ProcResGroup<[Zn4FP45]>;

// Integer Multiplies, SAD, Blendvb
def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;

// *AES*
def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;

// *CLM*
def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 192 vector registers
// of 512b each in zen4.
def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/  // scheduler 0
                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
                         ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?


//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn4LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
let Super = Zn4LSU in
def Zn4Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn4LoadQueue : LoadQueue<Zn4Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn4LSU in
def Zn4Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn4StoreQueue : StoreQueue<Zn4Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
}

multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  defm : __Zn4WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn4Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                           [],
                           !listconcat([1, LoadRes],
                                       !if(!empty(Res),
                                           !listsplat(1, !size(ExePorts)),
                                           Res))),
                       !add(UOps, LoadUOps)>;
}

// For classes without folded loads.
multiclass Zn4WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResZMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.LoadLatency,
                           LoadUOps, Zn4AGU012, LoadRes>;
}

multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 2,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;

def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
  let Latency = !add(Znver4Model.LoadLatency, 1);
  let ResourceCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;

defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = Znver4Model.LoadLatency;
  let ResourceCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
  let Latency = Znver4Model.StoreLatency;
  let ResourceCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.

def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                        OR8i8, OR16i16, OR32i32, OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;

def Zn4WriteMaterialize32bitImm : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;

def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.

def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
  let Latency = 1;
  let ResourceCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn4SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

def Zn4WriteLEA : SchedWriteVariant<[
    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.

defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3;
  let ResourceCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
  let ResourceCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ResourceCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;

def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;

defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.

defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.

def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.

def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.

def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis

// Floating point. This covers both scalar and vector operations.
// x87 constant loads (FLDZ/FLD1/FLDC) go through the FP load pipe plus Zn4FP1.
defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
// FP/vector loads use VecLoadLatency (7, per AMD SOG 2.12 FP load-to-use) + 1.
defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// MOVHPD/MOVHPS stores (store of the high 64 bits) are 2-uop stores.
def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// Masked stores are heavily microcoded; uop counts scale with element count.
defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// x87 arithmetic with integer-memory operand (FIADD etc.): the FPU pipes are
// blocked for a long time (ResourceCycles 24/62) even though NumMicroOps is 2.
def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;

// FP add/sub: 3-cycle latency on the FAdd pipes; ZMM forms occupy the pipe
// for 2 cycles (512-bit ops split across the 256-bit datapath).
defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare.
defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
// NOTE(review): WriteFCmp64 has latency 1 while the rest of the FCmp family
// is 2 — looks intentional (measured), but worth confirming.
defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
// FP multiply: 3-cycle latency; ZMM occupies the 256-bit mul pipes twice.
defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
// Division and square root use the dedicated (non-pipelined) FDiv unit;
// ResourceCycles model its occupancy / reciprocal throughput.
defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate.
defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
// FMA: same 4-cycle latency across widths; ZMM doubles pipe occupancy.
defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add.
defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
// Dot products are microcoded multi-uop sequences.
defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).

defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;

// Vector integer operations.
defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// VEXTRACTF128/VEXTRACTI128 register form: one uop, 4-cycle latency.
def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

// Memory form: register-form numbers plus a store uop.
def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

// VINSERTF128 with a memory operand: load folds into the single uop
// (NumMicroOps adds 0 on top of the register form).
def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
// Masked integer stores mirror the FP masked-store numbers above.
defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;

// MMX <-> XMM/GPR register moves.
def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

// SSE4a EXTRQ/INSERTQ.
def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
// Saturating/average/sign XMM integer ops are restricted to the FPVAdd01
// pipes and occupy them for 2 cycles, unlike the generic WriteVecALUX.
def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

// AVX-512 opmask (k-register) arithmetic/move/test: single-cycle, single uop.
def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
                                          KANDBrr, KANDDrr, KANDQrr, KANDWrr,
                                          KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
                                          KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
                                          KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
                                          KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
                                          KORBrr, KORDrr, KORQrr, KORWrr,
                                          KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
                                          KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
                                          KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
                                          KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
                                          KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;

// Opmask store (k -> memory) goes through the dedicated Zn4FPOpMask4 pipe.
def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;

// GPR -> opmask move.
def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;

def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
  // TODO: All align instructions are expected to be of 4 cycle latency
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
            >;
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

// YMM forms of the saturating/average/sign ops are single-cycle (contrast
// with the 2-cycle XMM Zn4WriteVecALUXSlow above).
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).

defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
// Vector integer multiply: 3-cycle latency; ZMM doubles pipe occupancy.
defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
// MPSAD is microcoded; heavy ResourceCycles on the VAdd pipes.
defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer.
defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).

// NOTE(review): Zn4WriteCvtPD2IMMX (and the Cvt*MMX defs below) have no
// InstRW in this region of the file — presumably referenced elsewhere;
// confirm they are not dead.
def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer.

defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).

def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 2;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}

defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).

def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).

defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).

defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
// NOTE(review): the ZMM store variant uses the YMM multiclass with YMM
// numbers — possibly a deliberate reuse, possibly copy-paste; confirm.
defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).

// CRC32 instruction.
defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;

// SHA-NI instruction scheduling. Pattern used throughout: an *rr class gives
// the register-form numbers, and the matching *rm class derives its latency
// from the rr class plus the model's LoadLatency, with !add on NumMicroOps
// expressing the load-form uop delta explicitly (even when it is 0).

def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
  let ResourceCycles = [1, 1, 3];
  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 3;
  let ResourceCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

// Memory form takes one extra uop (+1) on top of the rr form, unlike the
// other SHA rm forms above.
def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 6;
  let ResourceCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis

// Load/store MXCSR
defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis

// Catch-all for expensive system instructions.
defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;

// VZEROUPPER: modeled as free (0-latency, 1 uop).
def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL: heavyweight — 18 uops occupying the FP units for 24 cycles.
def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.

// Cross-lane permutes (VPERM2x128 / VPERMPS / VPERMPD / VPERMQ / VPERMD).
// Same rr/rm pattern as elsewhere in the file: the rm class derives latency
// from its rr counterpart plus LoadLatency.

def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;

// NOTE(review): only the VPERM2F128 memory form is covered here; VPERM2I128rm
// is not mapped in this chunk — presumably handled elsewhere, confirm.
def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 7;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 6;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 5;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;

def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;

defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).

// Old microcoded instructions that nobody uses.
defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;

def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ResourceCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;

def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except it provides a model for nops!
defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// Moves eliminated at register rename: no execution resources, zero latency.
def Zn4WriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
  let ResourceCycles = [];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
                                            MOV64rr, MOV64rr_REV,
                                            MOVSX32rr32)>;

// Register-register exchanges handled by renaming: 2 uops but no resources.
def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
  let Latency = 0;
  let ResourceCycles = [];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
                                               XCHG64rr, XCHG64ar)>;

defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.

// FP/vector register moves: XMM/YMM/ZMM forms are modeled as eliminated
// (no resources, zero latency); only the MMX move consumes an FP pipe.
defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;

defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;

// Tell the MCA/scheduler which reg-reg moves the rename stage may eliminate.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

// FIXUP and RANGE Instructions
def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
  "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
  "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
  "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
  )>;

// SCALE & REDUCE instructions
def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
  let Latency = 6;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSCALErr], (instregex
  "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
  "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
  )>;

//BF16PS Instructions
def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
  let Latency = 6;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteBF16], (instregex
  "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
  )>;

// BUSD and VPMADD Instructions
def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 4;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
  "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
  "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
  )>;

// SHIFT instructions
def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
// NOTE(review): "VFMSUB231SSZr_Intkz" looks out of place in a SHIFT class —
// confirm it really belongs here rather than with the FMA writes.
def : InstRW<[Zn4WriteSHIFTrr], (instregex
  "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
  "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)P(SLL|SRL|SRA)DQYri",
  "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
  "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
  "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
  "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
  )>;

// Immediate-count shifts are cheaper (1 cycle) than the variable forms above.
def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHIFTri], (instregex
  "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
  )>;

// ALIGN Instructions
def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALIGN], (instregex
  "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
  )>;

//PACK Instructions
def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePACK], (instregex
  "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
  )>;

// MAX and MIN Instructions
def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteFCmp64], (instregex
  "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
  "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
  "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
  "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
  )>;

// MOV Instructions
// Packed extend/truncate moves: latency grows with vector width —
// 2c (XMM/YMM forms), 4c (ZMM extends), 5c (ZMM truncates), below.
def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVS], (instregex
  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
  "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
  "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
  "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
  )>;

def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 4;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVSZ], (instregex
  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
  )>;

def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 5;
  let ResourceCycles = [5];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVSrr], (instregex
  "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
  )>;


//VPTEST Instructions
// Latency scales with width: 3c (128-bit), 4c (256-bit), 5c (512-bit).
def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 3;
  let ResourceCycles = [3];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ128], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
  )>;

def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 4;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ256], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
  )>;

def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 5;
  let ResourceCycles = [5];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
  )>;

// CONFLICT Instructions
def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4CONFLICTZ128], (instregex
  "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
  )>;

// Wider VPCONFLICT forms split into 4 uops across three FP misc pipe groups.
def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
  let Latency = 6;
  let ResourceCycles = [2,2,2];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4CONFLICTrr], (instregex
  "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
  )>;

// RSQRT Instructions
def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 5;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
  "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
  )>;


// PERM Instructions
def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMILP], (instregex
  "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
  )>;

// VPERMI2/VPERMT2 and compress/expand: latency scales with vector width —
// 3c (128-bit two-source), 4c (256-bit), 5c (512-bit), below.
def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 3;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_128], (instregex
  "VPERM(I2|T2)(PS|PD|W)128(rr|rrk|rrkz)",
  "VPERM(I2|T2)(B|D|Q)128(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_128rr], (instregex
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
  "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 4;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_256], (instregex
  "VPERM(I2|T2)(PS|PD|W)256(rr|rrk|rrkz)",
  "VPERMP(S|D)Z256(rr|rrk|rrkz)",
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
  "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
  "VPERM(I2|Q|T2)(B|D|Q)(Z?)256(rr|rrk|rrkz)",
  "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 5;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2Z], (instregex
  "VPERM(I2|T2)(PS|PD|W)(rr|rrk|rrkz)",
  "VPERM(B|D|W)Z(rr|rrk|rrkz)",
  "VPERM(I2|Q|T2)(B|D|Q)(Z?)(rr|rrk|rrkz)",
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
  "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
  "VPERMP(S|D)Z(rr|rrk|rrkz)"
  )>;

// ALU SLOW Misc Instructions
// 128-bit EVEX-encoded simple ALU ops (abs/saturating add-sub/avg/popcount)
// that take 2 cycles instead of the usual 1.
def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VecALUZSlow], (instrs
  VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
  VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
  VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
  VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
  VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk,
  VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
  VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
  VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
  VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
  VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
  VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk,
  VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
  VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
  VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
  )>;


///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Zero idioms (XOR r,r / SUB r,r / PXOR x,x, ...): when the predicate
// recognizes the idiom they are resolved at rename (Zn4WriteZeroLatency,
// defined earlier in this file); otherwise they execute as ordinary ops.

def Zn4WriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
                                          XOR64rr, XOR64rr_REV,
                                          SUB32rr, SUB32rr_REV,
                                          SUB64rr, SUB64rr_REV)>;

// CMP r,r with identical operands sets EFLAGS without a real dependency.
def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
                                                CMP16rr, CMP16rr_REV,
                                                CMP32rr, CMP32rr_REV,
                                                CMP64rr, CMP64rr_REV)>;

def Zn4WriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
                                           VANDNPSrr, VANDNPDrr)>;

def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
                                            VANDNPSYrr, VANDNPDYrr)>;

def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;

def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;

def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
// PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomALUX],
             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

// Full list of zero idioms consumed by llvm-mca's dependency analysis.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Instructions that break a dependency on the destination without being
// full zero idioms (e.g. SBB r,r and PCMPEQ x,x produce a value that does
// not depend on the previous contents of the register).
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP8rr, CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel