//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver3 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
//    http://www.agner.org/optimize/microarchitecture.pdf
//  * AMD Zen 3 Ryzen Deep Dive Review
//    https://www.anandtech.com/show/16214/
//===----------------------------------------------------------------------===//

def Znver3Model : SchedMachineModel {
  // Dispatch width.
  // AMD SOG 19h, 2.9.6 Dispatch
  //   The processor may dispatch up to 6 macro ops per cycle
  //   into the execution engine.
  let IssueWidth = 6;

  // In-flight macro op capacity; the non-SMT figure is used.
  // AMD SOG 19h, 2.10.3
  //   The retire control unit (RCU) tracks the completion status of all
  //   outstanding operations (integer, load/store, and floating-point) and is
  //   the final arbiter for exception processing and recovery.
  //   The unit can receive up to 6 macro ops dispatched per cycle and track up
  //   to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
  let MicroOpBufferSize = 256;

  // Loop buffer size.
  // AMD SOG 19h, 2.9.1 Op Cache
  //   The op cache is organized as an associative cache with 64 sets and 8 ways.
  //   At each set-way intersection is an entry containing up to 8 macro ops.
  //   The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  //   The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // let LoopMicroOpBufferSize = 4096;
  let LoopMicroOpBufferSize = 512;

  // Integer load-to-use latency.
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  //   The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  //   The AGU and LS pipelines are optimized for simple address generation modes.
  //   <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // FP/vector load-to-use latency (model-specific extra field).
  // AMD SOG 19h, 2.12 L1 Data Cache
  //   The AGU and LS pipelines are optimized for simple address generation modes.
  //   <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation (model-specific extra field).
  int StoreLatency = 1;

  // FIXME: any better choice?
  let HighLatency = 25;

  // AMD SOG 19h, 2.8 Optimizing Branching
  //   The branch misprediction penalty is in the range from 11 to 18 cycles,
  //   <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  // Enable Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  let CompleteModel = 1;
}

let SchedModel = Znver3Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
//   The unit can receive up to 6 macro ops dispatched per cycle and track up to
//   256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
//   The retire unit handles in-order commit of up to eight macro ops per cycle.
def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;

//===----------------------------------------------------------------------===//
// Units
//===----------------------------------------------------------------------===//

// There are a total of three units, each one with its own schedulers.

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
//   The processor uses four decoupled independent integer scheduler queues,
//   each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// Four general-purpose integer ALU pipes.
// AMD SOG 19h, 2.10.2 Execution Units
//   The processor contains 4 general purpose integer execution pipes.
//   Each pipe has an ALU capable of general purpose integer operations.
def Zn3ALU0 : ProcResource<1>;
def Zn3ALU1 : ProcResource<1>;
def Zn3ALU2 : ProcResource<1>;
def Zn3ALU3 : ProcResource<1>;

// Dedicated branch pipe.
// AMD SOG 19h, 2.10.2 Execution Units
//   There is also a separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// Address generation pipes.
// AMD SOG 19h, 2.10.2 Execution Units
//   There are three Address Generation Units (AGUs) for all load and store
//   address generation. There are also 3 store data movement units
//   associated with the same schedulers as the AGUs.
def Zn3AGU0 : ProcResource<1>;
def Zn3AGU1 : ProcResource<1>;
def Zn3AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// The divider is an alias for ALU0.
// AMD SOG 19h, 2.10.2 Execution Units
//   ALU0 additionally has divide <...> execution capability.
defvar Zn3Divider = Zn3ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
//   ALU0 additionally has <...> branch execution capability.
defvar Zn3BRU0 = Zn3ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;

// General AGU operations
def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;

// Control flow: jumps, calls
def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;

// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
//   The integer physical register file (PRF) consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<
    192,
    [GR64, CCR],
    [1, 1],
    [1, 0],
    6,   // Max moves that can be eliminated per cycle.
    0>;  // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
//   The schedulers can receive up to six macro ops per cycle, with a limit of
//   two per scheduler. Each scheduler can issue one micro op per cycle into
//   each of its associated pipelines
// FIXME: these are 4 separate schedulers, not a single big one.
def Zn3Int : ProcResGroup<[
    Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
    Zn3ALU1, Zn3AGU1,          // scheduler 1
    Zn3ALU2, Zn3AGU2,          // scheduler 2
    Zn3ALU3, Zn3BRU1           // scheduler 3
]> {
  // Four schedulers of 24 entries each, modelled as one pool.
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
//   The processor uses <...> two decoupled independent floating point schedulers
//   each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
//   <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
//   There are six floating point/vector execution pipes.
// Pipes 0-3 are individual resources; Zn3FPP45 is one resource with two units.
def Zn3FPP0 : ProcResource<1>;
def Zn3FPP1 : ProcResource<1>;
def Zn3FPP2 : ProcResource<1>;
def Zn3FPP3 : ProcResource<1>;
def Zn3FPP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn3FPFMul0 = Zn3FPP0;
defvar Zn3FPFMul1 = Zn3FPP1;

// (v)FADD*
defvar Zn3FPFAdd0 = Zn3FPP2;
defvar Zn3FPFAdd1 = Zn3FPP3;

// All convert operations except pack/unpack
defvar Zn3FPFCvt0 = Zn3FPP2;
defvar Zn3FPFCvt1 = Zn3FPP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
//   FDIV unit can support 2 simultaneous operations in flight
//   even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn3FPFDiv = Zn3FPP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn3FPFMisc0 = Zn3FPP0;
defvar Zn3FPFMisc1 = Zn3FPP1;
defvar Zn3FPFMisc2 = Zn3FPP2;
defvar Zn3FPFMisc3 = Zn3FPP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn3FPVAdd0 = Zn3FPP0;
defvar Zn3FPVAdd1 = Zn3FPP1;
defvar Zn3FPVAdd2 = Zn3FPP2;
defvar Zn3FPVAdd3 = Zn3FPP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn3FPVMul0 = Zn3FPP0;
defvar Zn3FPVMul1 = Zn3FPP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn3FPVShuf = Zn3FPP1;
defvar Zn3FPVShufAux = Zn3FPP2;

// Bit Shift Left/Right operations
defvar Zn3FPVShift0 = Zn3FPP1;
defvar Zn3FPVShift1 = Zn3FPP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn3FPVMisc0 = Zn3FPP0;
defvar Zn3FPVMisc1 = Zn3FPP1;
defvar Zn3FPVMisc2 = Zn3FPP2;
defvar Zn3FPVMisc3 = Zn3FPP3;

// *AES*
defvar Zn3FPAES0 = Zn3FPP0;
defvar Zn3FPAES1 = Zn3FPP1;

// *CLM*
defvar Zn3FPCLM0 = Zn3FPP0;
defvar Zn3FPCLM1 = Zn3FPP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// The four general FP pipes (0-3) as one group.
// AMD SOG 19h, 2.11 Floating-Point Unit
//   Stores and floating point to general purpose register transfer
//   have 2 dedicated pipelines (pipe 5 and 6).
def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[
    Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3
]>;

def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
//   Stores and floating point to general purpose register transfer
//   have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn3FPLd01 = Zn3FPP45;

// FP store: a single-unit sub-resource of Zn3FPP45.
// AMD SOG 19h, 2.11 Floating-Point Unit
//   Note that FP stores are supported on two pipelines,
//   but throughput is limited to one per cycle.
def Zn3FPSt : ProcResource<1> {
  let Super = Zn3FPP45;
}

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn3FPVAdd0123 : ProcResGroup<[
    Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3
]>;

def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[
    Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3
]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
//   The floating point register file has 160 vector registers
//   of 128 bits each in Zen 1 and 256 bits each in Zen 2.
// anandtech also confirms this.
def Zn3FpPRF : RegisterFile<
    160,
    [VR64, VR128, VR256],
    [1, 1, 1],
    [0, 1, 1],
    6,   // Max moves that can be eliminated per cycle.
    0>;  // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
//   The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
//   <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[
    Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/          // scheduler 0
    Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
]> {
  // Two schedulers of 32 entries each, modelled as one pool.
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
//   Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
//   even if floating-point scheduler is full.
// FIXME: how to model this properly?


//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
//   The LS unit contains three largely independent pipe-lines
//   enabling the execution of three 256-bit memory operations per cycle.
def Zn3LSU : ProcResource<3>;

// Load side of the LS unit (sub-resource of Zn3LSU).
// AMD SOG 19h, 2.12 Load-Store Unit
//   All three memory operations can be loads.
def Zn3Load : ProcResource<3> {
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  //   The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn3LoadQueue : LoadQueue<Zn3Load>;

// Store side of the LS unit (sub-resource of Zn3LSU).
// AMD SOG 19h, 2.12 Load-Store Unit
//   A maximum of two of the memory operations can be stores.
def Zn3Store : ProcResource<2> {
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  //   The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn3StoreQueue : StoreQueue<Zn3Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

// Emits one anonymous WriteRes for SchedRW on ExePorts with the given
// latency (Lat), per-port resource cycles (Res) and micro-op count (UOps).
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let Latency = Lat;
    let ReleaseAtCycles = Res;
  }
}

// Emits the register form of SchedRW plus its folded-load form
// (SchedRW.Folded).
// The folded form:
//  * prepends AGU and Zn3Load to ExePorts,
//  * adds LoadLat to Lat and LoadUOps to UOps,
//  * keeps the resource-cycle list empty when Res is empty and LoadRes is 1;
//    otherwise it is [1, LoadRes] followed by Res, or by one cycle per entry
//    of ExePorts when Res is empty.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register form.
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Folded-load form.
  defvar FoldedPorts = !listconcat([AGU, Zn3Load], ExePorts);
  defvar FoldedLat = !add(Lat, LoadLat);
  defvar FoldedUOps = !add(UOps, LoadUOps);
  defm : __zn3WriteRes<SchedRW.Folded,
                       FoldedPorts,
                       FoldedLat,
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                           [],
                           !listconcat([1, LoadRes],
                                       !if(!empty(Res),
                                           !listsplat(1, !size(ExePorts)),
                                           Res))),
                       FoldedUOps>;
}

// For classes without folded loads.
// The Int/XMM/YMM variants below all forward unchanged to __zn3WriteRes.
multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
// Integer pair: the folded load uses the integer load latency and the
// Zn3AGU012 group as its address-generation resource.
multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.LoadLatency, LoadUOps,
                           Zn3AGU012, LoadRes>;
}

// XMM pair: the folded load uses the vector load latency and Zn3FPLd01.
multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency, LoadUOps,
                           Zn3FPLd01, LoadRes>;
}

// YMM pair: same parameters as the XMM pair.
multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver3Model.VecLoadLatency, LoadUOps,
                           Zn3FPLd01, LoadRes>;
}


//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

// Read advances: integer reads after a load use the integer load latency,
// vector reads use the vector load latency.
def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
//   There is 1 cycle of added latency for a result to cross
//   from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load],
                      !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [],
                      !add(Znver3Model.LoadLatency, 1), [], 0>;

// Loads listed below hold the AGU group for 3 cycles.
def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ReleaseAtCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVSlow], (instrs
    MOV8rm, MOV8rm_NOREX, MOV16rm,
    MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
    MOVSX16rm8, MOVZX16rm8)>;

defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

// MOVBE, 16-bit load form.
def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = Znver3Model.LoadLatency;
  let ReleaseAtCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

// MOVBE, store forms.
def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let Latency = Znver3Model.StoreLatency;
  let ReleaseAtCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.

// Accumulator-immediate ALU forms: same latency as WriteALU, but the ALU
// group is held for 4 cycles.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteALUSlow], (instrs
    ADD8i8, ADD16i16, ADD32i32, ADD64i32,
    AND8i8, AND16i16, AND32i32, AND64i32,
    OR8i8,  OR16i16,  OR32i32,  OR64i32,
    SUB8i8, SUB16i16, SUB32i32, SUB64i32,
    XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

// Register-to-register sign/zero extensions with a 16-bit destination.
def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMoveExtend], (instrs
    MOVSX16rr16, MOVSX16rr32, MOVZX16rr16,
    MOVSX16rr8, MOVZX16rr8)>;

// Materialization of a 32-bit immediate.
def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs
    MOV32ri, MOV32ri_alt, MOV64ri32)>;

// PDEP/PEXT register forms issue on ALU1 only.
def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs
    PDEP32rr, PDEP64rr,
    PEXT32rr, PEXT64rr)>;

defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.

// 8-bit ADC/SBB with a memory destination.
def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}

// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Slow LEAs use Zn3Write3OpsLEA; every other LEA falls back to WriteLEA.
def Zn3WriteLEA : SchedWriteVariant<[
  SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
  SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// 16-bit LEA.
def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
defm : Zn3WriteResIntPair<WriteIMul8,     [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16,    [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32,    [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX32,    [Zn3Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64,    [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX64,    [Zn3Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// The high-part writes use no resources and no micro-ops; they only carry latency.
defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>; // Integer multiplication, high part.
defm : Zn3WriteResInt<WriteIMulH,   [], 4, [], 0>; // Integer multiplication, high part.

defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

// 8-bit register-register CMPXCHG.
def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

// 8-bit memory CMPXCHG: latency and micro-op count are derived from the
// register form (Zn3WriteCMPXCHG8rr) plus the load latency / 2 extra uops.
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
  let ReleaseAtCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

// 8- and 16-bit register XCHG forms.
def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;

def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
defm : Zn3WriteResIntPair<WriteDiv8,   [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteDiv16,  [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteDiv32,  [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteDiv64,  [Zn3Divider], 17, [17], 2>;
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;

defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.

defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.

// 16-bit register POPCNT holds the ALU group for 4 cycles.
def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.

// 16-bit register LZCNT holds the ALU group for 4 cycles.
def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.

// 16-bit register TZCNT holds the ALU group for 4 cycles.
def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn3WriteResIntPair<WriteShift,   [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteRotate,  [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Rotate-through-carry by 1, register forms.
def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteRotateR1], (instrs
    RCL8r1, RCL16r1, RCL32r1, RCL64r1,
    RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

// Rotate-through-carry by 1, memory forms; latency and micro-op count are
// derived from Zn3WriteRotateR1.
def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteRotateM1], (instrs
    RCL8m1, RCL16m1, RCL32m1, RCL64m1,
    RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

// RCR by immediate, register forms.
def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

// RCR by immediate, memory forms; derived from Zn3WriteRotateRightRI.
def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

// RCL by immediate, register forms.
def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

// RCL by immediate, memory forms; derived from Zn3WriteRotateLeftRI.
def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// RCR by CL, register forms.
def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

// RCR by CL, memory forms; derived from Zn3WriteRotateRightRCL.
def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

// RCL by CL, register forms.
def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

// RCL by CL, memory forms; derived from Zn3WriteRotateLeftRCL.
def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn3WriteResInt<WriteSHDrri,  [Zn3ALU12], 2, [3], 4>;
defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
defm : Zn3WriteResInt<WriteSHDmri,  [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12],   1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBLS,   [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBZHI,  [Zn3ALU12],   1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis

// Floating point. This covers both scalar and vector operations.
// x87 constant loads, FP vector loads/stores (plain, non-temporal, masked),
// x87 integer-memory arithmetic overrides, and FP add/compare entries.
// Load latencies are expressed relative to Znver3Model.LoadLatency /
// VecLoadLatency and store latencies use Znver3Model.StoreLatency, so they
// track the values set in the machine model.
defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Override for the MOVHPD/MOVHPS store forms (SSE and VEX encodings).
// NOTE(review): the record name says "MMX" but the instructions listed are
// MOVHP* stores -- the name looks historical; confirm before renaming.
def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Masked FP stores: same latency as plain stores, but far more micro-ops and
// longer Zn3FPSt occupancy.
defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// x87 add/sub/mul with a 16/32-bit integer memory operand.
def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

// x87 divide with a 16/32-bit integer memory operand.
def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;

defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
// FP compare, compare-to-flags, multiply and single-precision divide entries.
// ZMM variants are registered as unsupported.
defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
// FP divide, square root and reciprocal-estimate entries. Divide and square
// root are bound to Zn3FPFDiv and hold it for several cycles (the bracketed
// value); the reciprocal estimate uses Zn3FPFMul01 instead.
defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
// Reciprocal square-root estimate, FMA, dot product, sign, rounding and FP
// logical entries.
defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add.
defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
// Dot products pass an explicit LoadUOps value for their memory forms.
defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
// FP TEST, shuffle, variable-shuffle and blend entries. ZMM variants are
// registered as unsupported.
defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;

// Vector integer operations.
defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// 128-bit lane extract, register destination.
def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
  let Latency = 4;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

// 128-bit lane extract, memory destination.
// NOTE(review): this is a store form (resources are Zn3FPSt/Zn3Store) yet its
// latency adds Znver3Model.LoadLatency rather than StoreLatency -- confirm
// this is intentional.
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

// 128-bit lane insert from memory; reuses the VEXTRACT register-form numbers
// as its base.
// NOTE(review): only VINSERTF128rm is listed here, with no VINSERTI128rm
// counterpart -- confirm that is intended.
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;

// MMX <-> XMM register moves.
def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

// Same as Zn3WriteMOVMMX but holds Zn3FPFMisc0123 for 4 cycles instead of 2.
def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

// EXTRQ/INSERTQ, register-controlled forms.
def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

// EXTRQI/INSERTQI, immediate-controlled forms (one extra micro-op).
def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

// XMM integer ALU ops restricted to Zn3FPVAdd01 instead of the Zn3FPVAdd0123
// group used by the generic WriteVecALUX entry above.
def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

// MMX counterparts of the restricted ALU ops above (same resource and costs).
def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
                                           MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr,
                                           MMX_PAVGBrr, MMX_PAVGWrr,
                                           MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>;

defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

// YMM integer ALU ops restricted to Zn3FPVAdd01 (the per-instruction override
// is applied through InstRW).
def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
// Vector integer immediate shift, multiply, shuffle and blend entries.
// ZMM variants are registered as unsupported.
defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
// Variable blend, PSADBW/MPSAD/PHMINPOS, vector insert/extract, MOVMSK and the
// first FP -> integer conversion entry.
defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
// NOTE(review): LoadUOps is negative here; presumably it adjusts the micro-op
// count of the memory form relative to the register form -- confirm against
// the Zn3WriteResXMMPair definition.
defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
// FP <-> integer conversion entries, all bound to Zn3FPFCvt01, plus
// MMX-specific overrides. ZMM variants are registered as unsupported.
defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).

// MMX double -> packed integer conversions.
// NOTE(review): the same record (no AGU/load resources, no load latency) is
// applied to both the rr and rm forms -- confirm that is intended.
def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.

defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).

// MMX packed integer -> double conversion (rm and rr forms share the record).
def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 2;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>;

defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).

// MMX packed integer -> float conversion. Only the register form is
// overridden here.
def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).

// Float -> Half has separate register and store SchedWrites rather than a
// load-folding pair, so the non-Pair helpers are used.
defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
// Remaining Float -> Half store entries, CRC32, SHA per-instruction overrides,
// SSE4.2 string compares and the first AES entries.
defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).

// CRC32 instruction.
defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;

// SHA overrides. Each rm record derives its latency from the matching rr
// record plus Znver3Model.LoadLatency; most add 0 micro-ops for the load,
// SHA256MSG2rm adds 1.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [3];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 3];
  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

// Only the register forms of the SHA round instructions are overridden here.
def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 6;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>;

// Load/store MXCSR
// FIXME: latency not from llvm-exegesis (applies to both writes below)
defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>;
defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store],
                      !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;

// VZEROUPPER is modelled with zero latency.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL is modelled as 18 uops.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf],
                          2, [1], 1, /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf],
                          7, [1], 2, /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf],
                          2, [1], 1>;

// VPERM2I128 / VPERM2F128, register forms.
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr, VPERM2F128rr)>;

// VPERM2F128, load-folded form: register-form latency plus the model's load
// latency, no extra uop.
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

// VPERMPS (YMM), load-folded form.
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, 7);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

// VPERMPD / VPERMQ (YMM), register + immediate forms.
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 6;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

// VPERMPD (YMM), load-folded + immediate form: Zn3WriteVPERMYri latency plus
// the model's load latency, one extra uop.
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

// VPERMQ / VPERMD (YMM), load-folded forms.
def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, 5);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMDYm], (instrs VPERMQYmi, VPERMDYrm)>;

// 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01],
                          4, [3], 2, /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf],
                          5, [1], 2, /*LoadUOps=*/1>;
// Variable vector shifts.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (ZMM).
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;

// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;

// LFENCE has its own write on Zn3LSU.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;

// SFENCE has its own write on Zn3LSU.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>;


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// A single uop with zero latency that consumes no processor resources.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
  let ReleaseAtCycles = [];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteZeroLatency], (instrs
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32)>;

// Register-register exchange: two uops, zero latency, no processor resources.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let Latency = 0;
  let ReleaseAtCycles = [];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSwapRenameable], (instrs
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar)>;

// Compare+Exchange - TODO RMW support.
defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;

// FP register moves. The X/Y variants use no resources and have zero latency.
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : X86WriteResUnsupported<WriteFMoveZ>;

// Vector register moves. The X/Y variants use no resources and have zero
// latency.
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : X86WriteResUnsupported<WriteVecMoveZ>;

// Register moves declared optimizable for this model, unconditionally
// (the class predicate is TruePred).
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Each variant below selects Zn3WriteZeroLatency when its predicate holds and
// otherwise falls back to the regular write for that instruction group.

// GPR XOR/SUB.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom], (instrs
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV)>;

// GPR CMP: predicate is "operands 0 and 1 are the same register".
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS], (instrs
    CMP8rr, CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV)>;

// AVX XMM fp logic.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn3WriteFZeroIdiom], (instrs
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr)>;

// AVX YMM fp logic.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY], (instrs
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr)>;

// AVX XMM integer logic.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr, PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;

// AVX YMM integer logic: zero latency when ZeroIdiomPredicate holds,
// WriteVecLogicY otherwise.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;

// AVX XMM integer ALU: zero latency when ZeroIdiomPredicate holds,
// WriteVecALUX otherwise.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX], (instrs
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// AVX YMM integer ALU: zero latency when ZeroIdiomPredicate holds,
// WriteVecALUY otherwise.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY], (instrs
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

// Instructions treated as zero idioms when ZeroIdiomPredicate holds.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Instructions treated as dependency-breaking when the class predicate holds.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV,
    SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr, CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel