//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver3 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
//    http://www.agner.org/optimize/microarchitecture.pdf
//  * AMD Zen 3 Ryzen Deep Dive Review
//    https://www.anandtech.com/show/16214/
//===----------------------------------------------------------------------===//

// Top-level machine model record. Besides the standard SchedMachineModel
// fields it declares two extra integer fields (VecLoadLatency, StoreLatency)
// that the rest of this file reads as Znver3Model.<field>.
def Znver3Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;

  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
  let MicroOpBufferSize = 256;

  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // let LoopMicroOpBufferSize = 4096;
  let LoopMicroOpBufferSize = 512;

  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation.
  int StoreLatency = 1;

  // FIXME
  let HighLatency = 25; // FIXME: any better choice?

  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  let CompleteModel = 1;
}

let SchedModel = Znver3Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to eight macro ops per cycle.
def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;

//===----------------------------------------------------------------------===//
// Units
//===----------------------------------------------------------------------===//

// There are a total of three Units, each one with its own schedulers.

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn3ALU0 : ProcResource<1>;
def Zn3ALU1 : ProcResource<1>;
def Zn3ALU2 : ProcResource<1>;
def Zn3ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn3AGU0 : ProcResource<1>;
def Zn3AGU1 : ProcResource<1>;
def Zn3AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
// The divider is an alias of ALU0, not a separate resource.
defvar Zn3Divider = Zn3ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn3BRU0 = Zn3ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;

// General AGU operations
def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;

// Control flow: jumps, calls
// (Zn3BRU0 is the alias of Zn3ALU0 defined above.)
def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;

// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
                              6,  // Max moves that can be eliminated per cycle.
                              0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
// FIXME: these are 4 separate schedulers, not a single big one.
def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
                           Zn3ALU1, Zn3AGU1,          // scheduler 1
                           Zn3ALU2, Zn3AGU2,          // scheduler 2
                           Zn3ALU3,          Zn3BRU1  // scheduler 3
                          ]> {
  // Four schedulers of 24 entries each, modelled as one shared buffer.
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
def Zn3FPP0  : ProcResource<1>;
def Zn3FPP1  : ProcResource<1>;
def Zn3FPP2  : ProcResource<1>;
def Zn3FPP3  : ProcResource<1>;
// Pipes 4 and 5 are modelled together as a single two-unit resource.
def Zn3FPP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn3FPFMul0 = Zn3FPP0;
defvar Zn3FPFMul1 = Zn3FPP1;

// (v)FADD*
defvar Zn3FPFAdd0 = Zn3FPP2;
defvar Zn3FPFAdd1 = Zn3FPP3;

// All convert operations except pack/unpack
defvar Zn3FPFCvt0 = Zn3FPP2;
defvar Zn3FPFCvt1 = Zn3FPP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn3FPFDiv = Zn3FPP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn3FPFMisc0 = Zn3FPP0;
defvar Zn3FPFMisc1 = Zn3FPP1;
defvar Zn3FPFMisc2 = Zn3FPP2;
defvar Zn3FPFMisc3 = Zn3FPP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn3FPVAdd0 = Zn3FPP0;
defvar Zn3FPVAdd1 = Zn3FPP1;
defvar Zn3FPVAdd2 = Zn3FPP2;
defvar Zn3FPVAdd3 = Zn3FPP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn3FPVMul0 = Zn3FPP0;
defvar Zn3FPVMul1 = Zn3FPP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn3FPVShuf = Zn3FPP1;
defvar Zn3FPVShufAux = Zn3FPP2;

// Bit Shift Left/Right operations
defvar Zn3FPVShift0 = Zn3FPP1;
defvar Zn3FPVShift1 = Zn3FPP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn3FPVMisc0 = Zn3FPP0;
defvar Zn3FPVMisc1 = Zn3FPP1;
defvar Zn3FPVMisc2 = Zn3FPP2;
defvar Zn3FPVMisc3 = Zn3FPP3;

// *AES*
defvar Zn3FPAES0 = Zn3FPP0;
defvar Zn3FPAES1 = Zn3FPP1;

// *CLM*
defvar Zn3FPCLM0 = Zn3FPP0;
defvar Zn3FPCLM1 = Zn3FPP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
// The group below covers the four remaining pipes, Zn3FPP0..Zn3FPP3.
def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3]>;

def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn3FPLd01 = Zn3FPP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
// Hence a single-unit resource nested under the two-unit Zn3FPP45.
let Super = Zn3FPP45 in
def Zn3FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3]>;

def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 160 vector registers
// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
// anandtech also confirms this.
def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
                          Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
                         ]> {
  // Two schedulers of 32 entries each, modelled as one shared buffer.
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?


//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn3LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
let Super = Zn3LSU in
def Zn3Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn3LoadQueue : LoadQueue<Zn3Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn3LSU in
def Zn3Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn3StoreQueue : StoreQueue<Zn3Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

// Emits one anonymous WriteRes for SchedRW on ExePorts, with
//   Latency        = Lat
//   ResourceCycles = Res
//   NumMicroOps    = UOps
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
}

// Emits the register form of SchedRW plus its folded-load form
// (SchedRW.Folded). The folded form is derived from the register form:
//   * resources:  [AGU, Zn3Load] is prepended to ExePorts;
//   * latency:    Lat + LoadLat;
//   * micro-ops:  UOps + LoadUOps;
//   * cycles:     left empty when Res is empty and LoadRes is 1; otherwise
//                 [1, LoadRes] followed by Res, or by one 1 per entry of
//                 ExePorts when Res is empty.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register variant.
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Folded-load variant.
  defm : __zn3WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn3Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes],
                           !if(!empty(Res),
                             !listsplat(1, !size(ExePorts)),
                             Res))),
                       !add(UOps, LoadUOps)>;
}

// For classes without folded loads.
// The Int/XMM/YMM wrappers all forward unchanged to __zn3WriteRes.
multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
// Integer pair: the folded load uses the integer load latency and the
// Zn3AGU012 group for address generation.
multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.LoadLatency,
                           LoadUOps, Zn3AGU012, LoadRes>;
}

// XMM pair: the folded load uses the vector load latency and Zn3FPLd01.
multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency,
                           LoadUOps, Zn3FPLd01, LoadRes>;
}

// YMM pair: same forwarding as the XMM pair.
multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency,
                           LoadUOps, Zn3FPLd01, LoadRes>;
}


//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;

def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ResourceCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm,
                                        MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
                                        MOVSX16rm8, MOVZX16rm8)>;

defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = Znver3Model.LoadLatency;
  let ResourceCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let Latency = Znver3Model.StoreLatency;
  let ResourceCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.

// Accumulator-with-immediate forms of ADD/AND/OR/SUB/XOR.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

// Register-to-register sign/zero extension with a 16-bit destination.
def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16,
                                           MOVSX16rr8, MOVZX16rr8)>;

def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;

def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.

def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
  let Latency = 1;
  let ResourceCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Slow LEAs take Zn3Write3OpsLEA; everything else falls back to WriteLEA.
def Zn3WriteLEA : SchedWriteVariant<[
    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>;    // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// The high-part writes carry latency only: no resources and zero micro-ops.
defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>; // Integer multiplication, high part.
defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;                                  // Integer multiplication, high part.

defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3;
  let ResourceCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

// Memory form of the 8-bit CMPXCHG: latency and micro-op count are derived
// from the register form (Zn3WriteCMPXCHG8rr).
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
  let ResourceCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ResourceCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;

def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
defm : Zn3WriteResIntPair<WriteDiv8,   [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteDiv16,  [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteDiv32,  [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteDiv64,  [Zn3Divider], 17, [17], 2>;
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;

defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.

defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.

def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.

def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.

def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Rotate-through-carry by one, register form.
def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

// Rotate-through-carry by one, memory form; derived from the register form.
def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

// RCR by immediate, register form.
def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

// RCR by immediate, memory form.
def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

// RCL by immediate, register form.
def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

// RCL by immediate, memory form.
def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// RCR by CL, register form.
def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

// RCR by CL, memory form.
def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

// RCL by CL, register form.
def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

// RCL by CL, memory form.
def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis

// Floating point. This covers both scalar and vector operations.
// x87 constant loads.
defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;

// FP loads: the model's vector load latency plus one cycle.
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;

// FP stores.
defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Per-instruction override for the (V)MOVHPD/(V)MOVHPS store forms.
def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs
    MOVHPDmr, MOVHPSmr,
    VMOVHPDmr, VMOVHPSmr)>;

defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// FP masked stores.
defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// Per-instruction override: x87 add/sub/mul with an integer memory operand.
def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Arith], (instrs
    ADD_FI16m, ADD_FI32m,
    SUB_FI16m, SUB_FI32m,
    SUBR_FI16m, SUBR_FI32m,
    MUL_FI16m, MUL_FI32m)>;

// Per-instruction override: x87 divide with an integer memory operand.
def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Div], (instrs
    DIV_FI16m, DIV_FI32m,
    DIVR_FI16m, DIVR_FI32m)>;

defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add.
defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;

// Vector integer operations.
defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Per-instruction override: VEXTRACTF128/VEXTRACTI128, register destination.
def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs
    VEXTRACTF128rr, VEXTRACTI128rr)>;

// Per-instruction override: VEXTRACTF128/VEXTRACTI128, "mr" forms.
// NOTE(review): this adds Znver3Model.LoadLatency even though the resources
// used are the store pipes - confirm that is intended.
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs
    VEXTRACTI128mr, VEXTRACTF128mr)>;

// Per-instruction override: VINSERTF128rm. Latency and uop count are derived
// from the VEXTRACT*128rr record above.
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;

// Per-instruction override: MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr.
def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

// Per-instruction override: MMX_MOVD64rr / MMX_MOVD64to64rr. Same as
// Zn3WriteMOVMMX except for 4 (rather than 2) cycles on Zn3FPFMisc0123.
def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

// Per-instruction override: EXTRQ / INSERTQ.
def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

// Per-instruction override: EXTRQI / INSERTQI (two uops).
def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

// Per-instruction override: XMM register-register ALU ops that are bound to
// Zn3FPVAdd01 instead of Zn3FPVAdd0123.
def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXSlow], (instrs
    PABSBrr, PABSDrr, PABSWrr,
    PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
    PAVGBrr, PAVGWrr,
    PSIGNBrr, PSIGNDrr, PSIGNWrr,
    VPABSBrr, VPABSDrr, VPABSWrr,
    VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
    VPAVGBrr, VPAVGWrr,
    VPCMPEQQrr,
    VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

// Per-instruction override: the MMX counterparts, also on Zn3FPVAdd01.
def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXMMX], (instrs
    MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
    MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
    MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr,
    MMX_PAVGBrr, MMX_PAVGWrr,
    MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>;

defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

// Per-instruction override: YMM register-register ALU ops that are bound to
// Zn3FPVAdd01 instead of Zn3FPVAdd0123.
def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUYSlow], (instrs
    VPABSBYrr, VPABSDYrr, VPABSWYrr,
    VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
    VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
    VPAVGBYrr, VPAVGWYrr,
    VPCMPEQQYrr,
    VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).

// Per-instruction override: MMX CVT(T)PD2PI, register and memory forms.
def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs
    MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm,
    MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.

defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).

// Per-instruction override: MMX CVTPI2PD, register and memory forms.
def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 2;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>;

defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).

// Per-instruction override: MMX_CVTPI2PSrr.
def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).

defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).

// CRC32 instruction.
defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;

// SHA per-instruction overrides. Each "rm" record takes its latency from the
// matching "rr" record plus the model's load latency.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
  let ResourceCycles = [1, 1, 3];
  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 3;
  let ResourceCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 6;
  let ResourceCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
// AES key generation.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>;

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>;

// Load/store MXCSR
// The load form adds 1 to the model's LoadLatency; the store form adds 1 to
// the model's StoreLatency.
// FIXME: latency not from llvm-exegesis (both entries)
defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>;
defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;

// VZEROUPPER: modeled with zero latency and a single micro-op.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL: modeled as 18 micro-ops.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>;

// Per-instruction overrides for the 256-bit permutes.
// Each memory form adds Znver3Model.LoadLatency to the latency of the
// register form it references and derives its micro-op count from that
// register form via !add.

// VPERM2I128 / VPERM2F128, register forms.
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr, VPERM2F128rr)>;

// VPERM2F128, memory form (no extra micro-op for the load).
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

// VPERMPS (YMM), register form.
def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 7;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

// VPERMPS (YMM), memory form (one extra micro-op).
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

// VPERMPD / VPERMQ (YMM), register+immediate forms.
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 6;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

// VPERMPD (YMM), memory+immediate form (one extra micro-op).
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

// VPERMD (YMM), register form.
def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 5;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;

// VPERMQ (mem+imm) and VPERMD (mem), YMM. Both are derived from the VPERMD
// register form, with no extra micro-op for the load.
def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;

// 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01], 1, [1], 2>;
// Variable vector shifts.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (ZMM): marked unsupported.
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;

// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;

// LFENCE override: occupies Zn3LSU for 30 cycles.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ResourceCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;

// SFENCE override: occupies Zn3LSU for a single cycle.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except that it provides a model for nops!
defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// Write with zero latency that consumes no processor resources; still counted
// as one micro-op.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
  let ResourceCycles = [];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;

// Same as above, but counted as two micro-ops; used for register-register
// XCHG.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let Latency = 0;
  let ResourceCycles = [];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;

defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.

// Floating-point register moves. The X/Y variants use no resources and have
// zero latency.
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : X86WriteResUnsupported<WriteFMoveZ>;

// Vector register moves. The X/Y variants use no resources and have zero
// latency.
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : X86WriteResUnsupported<WriteVecMoveZ>;

// Register moves registered as optimizable; the class is guarded by TruePred,
// i.e. it applies unconditionally to the listed opcodes.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// GPR zero idioms: Zn3WriteZeroLatency when ZeroIdiomPredicate holds,
// otherwise the regular WriteALU.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV)>;

// CMP of a register with itself (operands 0 and 1 are the same register):
// Zn3WriteZeroLatency, otherwise the regular WriteALU.
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                                 [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV)>;

// FP logic zero idioms (XMM): Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, otherwise the regular WriteFLogic.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn3WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;

// FP logic zero idioms (YMM): Zn3WriteZeroLatency when ZeroIdiomPredicate
// holds, otherwise the regular WriteFLogicY.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;

// Integer vector logic zero idioms (XMM): falls back to WriteVecLogicX.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;

// Integer vector logic zero idioms (YMM): falls back to WriteVecLogicY.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;

// Integer vector ALU zero idioms (XMM): falls back to WriteVecALUX.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX],
             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// Integer vector ALU zero idioms (YMM): falls back to WriteVecALUY.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

// Opcodes registered as zero idioms; every class below is guarded by
// ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Opcodes registered as dependency-breaking. The CMP class is guarded by
// CheckSameRegOperand<0, 1>; every other class is guarded by
// ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP8rr,  CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel