//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver3 to support instruction
// scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 19h Processors.
//    https://www.amd.com/system/files/TechDocs/56665.zip
//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
//    http://www.agner.org/optimize/microarchitecture.pdf
//  * AMD Zen 3 Ryzen Deep Dive Review
//    https://www.anandtech.com/show/16214/
//===----------------------------------------------------------------------===//

// Top-level machine model record. Besides the standard SchedMachineModel
// fields it carries two extra integers (VecLoadLatency, StoreLatency) that the
// rest of this file reads back as Znver3Model.<field>.
def Znver3Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  //   The processor may dispatch up to 6 macro ops per cycle
  //   into the execution engine.
  let IssueWidth = 6;

  // AMD SOG 19h, 2.10.3
  //   The retire control unit (RCU) tracks the completion status of all
  //   outstanding operations (integer, load/store, and floating-point) and is
  //   the final arbiter for exception processing and recovery.
  //   The unit can receive up to 6 macro ops dispatched per cycle and track up
  //   to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
  let MicroOpBufferSize = 256;

  // AMD SOG 19h, 2.9.1 Op Cache
  //   The op cache is organized as an associative cache with 64 sets and
  //   8 ways. At each set-way intersection is an entry containing up to
  //   8 macro ops. The maximum capacity of the op cache is 4K ops.
  // Assuming a maximum dispatch of 8 ops/cy and a mispredict cost of 12cy from
  // the op-cache, we limit the loop buffer to 8*12 = 96 to avoid loop unrolling
  // leading to excessive filling of the op-cache from frontend.
  let LoopMicroOpBufferSize = 96;

  // AMD SOG 19h, 2.6.2 L1 Data Cache
  //   The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  //   The AGU and LS pipelines are optimized for simple address generation
  //   modes. <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // AMD SOG 19h, 2.12 L1 Data Cache
  //   The AGU and LS pipelines are optimized for simple address generation
  //   modes. <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation.
  int StoreLatency = 1;

  // FIXME: any better choice?
  let HighLatency = 25;

  // AMD SOG 19h, 2.8 Optimizing Branching
  //   The branch misprediction penalty is in the range from 11 to 18 cycles,
  //   <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  // Enable Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  let CompleteModel = 1;
}

let SchedModel = Znver3Model in {


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to eight macro ops per cycle.
// Retire unit: sized from the model's micro-op buffer, retiring up to 8 per
// cycle.
def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;

//===----------------------------------------------------------------------===//
// Units
//===----------------------------------------------------------------------===//

// There are a total of three Units, each one with its own schedulers.

//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
def Zn3ALU0 : ProcResource<1>;
def Zn3ALU1 : ProcResource<1>;
def Zn3ALU2 : ProcResource<1>;
def Zn3ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn3AGU0 : ProcResource<1>;
def Zn3AGU1 : ProcResource<1>;
def Zn3AGU2 : ProcResource<1>;

//
// Execution Units
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn3Divider = Zn3ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
// The first branch unit is an alias of ALU0.
defvar Zn3BRU0 = Zn3ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;

// General AGU operations
def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;

// Control flow: jumps, calls
def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;

// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
// FIXME: these are 4 separate schedulers, not a single big one.
// Single group standing in for the whole integer scheduler; its buffer is
// written as 4 * 24 entries.
def Zn3Int : ProcResGroup<[
    Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
    Zn3ALU1, Zn3AGU1,          // scheduler 1
    Zn3ALU2, Zn3AGU2,          // scheduler 2
    Zn3ALU3, Zn3BRU1           // scheduler 3
  ]> {
  let BufferSize = !mul(4, 24);
}


//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes.
def Zn3FP0 : ProcResource<1>;
def Zn3FP1 : ProcResource<1>;
def Zn3FP2 : ProcResource<1>;
def Zn3FP3 : ProcResource<1>;
def Zn3FP45 : ProcResource<2>;

//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn3FPFMul0 = Zn3FP0;
defvar Zn3FPFMul1 = Zn3FP1;

// (v)FADD*
defvar Zn3FPFAdd0 = Zn3FP2;
defvar Zn3FPFAdd1 = Zn3FP3;

// All convert operations except pack/unpack
defvar Zn3FPFCvt0 = Zn3FP2;
defvar Zn3FPFCvt1 = Zn3FP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
// Divide / square root is an alias of pipe FP1.
defvar Zn3FPFDiv = Zn3FP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn3FPFMisc0 = Zn3FP0;
defvar Zn3FPFMisc1 = Zn3FP1;
defvar Zn3FPFMisc2 = Zn3FP2;
defvar Zn3FPFMisc3 = Zn3FP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn3FPVAdd0 = Zn3FP0;
defvar Zn3FPVAdd1 = Zn3FP1;
defvar Zn3FPVAdd2 = Zn3FP2;
defvar Zn3FPVAdd3 = Zn3FP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn3FPVMul0 = Zn3FP0;
defvar Zn3FPVMul1 = Zn3FP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn3FPVShuf = Zn3FP1;
defvar Zn3FPVShufAux = Zn3FP2;

// Bit Shift Left/Right operations
defvar Zn3FPVShift0 = Zn3FP1;
defvar Zn3FPVShift1 = Zn3FP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn3FPVMisc0 = Zn3FP0;
defvar Zn3FPVMisc1 = Zn3FP1;
defvar Zn3FPVMisc2 = Zn3FP2;
defvar Zn3FPVMisc3 = Zn3FP3;

// *AES*
defvar Zn3FPAES0 = Zn3FP0;
defvar Zn3FPAES1 = Zn3FP1;

// *CLM*
defvar Zn3FPCLM0 = Zn3FP0;
defvar Zn3FPCLM1 = Zn3FP1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
// NOTE(review): the quote above talks about the store / FP-to-GPR pipes, while
// the group below collects FP0..FP3 -- confirm the comment placement.
def Zn3FPU0123 : ProcResGroup<[Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
// FP add group: the two FADD aliases.
def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1,
                                   Zn3FPFMisc2, Zn3FPFMisc3]>;

def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn3FPLd01 = Zn3FP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
let Super = Zn3FP45 in
def Zn3FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1,
                                  Zn3FPVAdd2, Zn3FPVAdd3]>;

def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
// Shuffle group: the main shuffle alias plus the auxiliary one.
def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1,
                                   Zn3FPVMisc2, Zn3FPVMisc3]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;


//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 160 vector registers
// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
// anandtech also confirms this.
def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[
    Zn3FP0, Zn3FP2, /*Zn3FP4,*/        // scheduler 0
    Zn3FP1, Zn3FP3, Zn3FP45 /*Zn3FP5*/ // scheduler 1
  ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?
//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn3LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
let Super = Zn3LSU in
def Zn3Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

// Load queue backed by the load resource above.
def Zn3LoadQueue : LoadQueue<Zn3Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn3LSU in
def Zn3Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

// Store queue backed by the store resource above.
def Zn3StoreQueue : StoreQueue<Zn3Store>;

//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
// Emits one anonymous WriteRes for SchedRW on ExePorts, forwarding latency,
// per-resource release cycles and micro-op count.
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ReleaseAtCycles = Res;
    let NumMicroOps = UOps;
  }
}

// Emits the register form of SchedRW plus its folded-load form
// (SchedRW.Folded). The folded form prepends [AGU, Zn3Load] to the ports, adds
// LoadLat to the latency and LoadUOps to the micro-op count.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register form.
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Resources of the folded form: address generation and load come first.
  defvar FoldedPorts = !listconcat([AGU, Zn3Load], ExePorts);

  // Cycles for the execution ports: the caller's list, or one cycle per port
  // when the caller passed none.
  defvar ExeCycles = !if(!empty(Res), !listsplat(1, !size(ExePorts)), Res);

  // Stay with an empty list only when nothing was specified at all (no Res
  // and LoadRes == 1); otherwise spell out [1, LoadRes] followed by the
  // execution-port cycles.
  defvar FoldedCycles = !if(!and(!empty(Res), !eq(LoadRes, 1)),
                            [],
                            !listconcat([1, LoadRes], ExeCycles));

  // Folded-load form.
  defm : __zn3WriteRes<SchedRW.Folded,
                       FoldedPorts,
                       !add(Lat, LoadLat),
                       FoldedCycles,
                       !add(UOps, LoadUOps)>;
}

// For classes without folded loads.
// The three wrappers below forward their arguments unchanged.
multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
// Integer pair: the folded form uses the integer load latency and the AGU
// group for address generation.
multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.LoadLatency,
                           LoadUOps, /*AGU=*/Zn3AGU012, LoadRes>;
}

// XMM pair: the folded form uses the vector load latency and Zn3FPLd01.
multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.VecLoadLatency,
                           LoadUOps, /*AGU=*/Zn3FPLd01, LoadRes>;
}

// YMM pair: same parameters as the XMM pair.
multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.VecLoadLatency,
                           LoadUOps, /*AGU=*/Zn3FPLd01, LoadRes>;
}


//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load],
                      !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER
// operation. Does not cost anything by itself, only has latency, matching that
// of the WriteLoad.
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [],
                      !add(Znver3Model.LoadLatency, 1), [], 0>;

// Loads listed below hold the AGU group for 3 cycles.
def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ReleaseAtCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm,
                                        MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
                                        MOVSX16rm8, MOVZX16rm8)>;

defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = Znver3Model.LoadLatency;
  let ReleaseAtCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let Latency = Znver3Model.StoreLatency;
  let ReleaseAtCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;

// Arithmetic.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.

// Accumulator-immediate ALU forms: same latency as WriteALU, but the ALU group
// is held for 4 cycles.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteALUSlow], (instrs
    ADD8i8, ADD16i16, ADD32i32, ADD64i32,
    AND8i8, AND16i16, AND32i32, AND64i32,
    OR8i8,  OR16i16,  OR32i32,  OR64i32,
    SUB8i8, SUB16i16, SUB32i32, SUB64i32,
    XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

// Register sign/zero extensions with a 16-bit destination.
def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32,
                                           MOVZX16rr16, MOVSX16rr8,
                                           MOVZX16rr8)>;

// Materializing a 32-bit immediate.
def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt,
                                                    MOV64ri32)>;

// PDEP/PEXT register forms, bound to ALU1.
def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.

def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}

// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
// Predicate selecting the slow LEA forms: true when either alternative below
// matches.
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Slow LEAs use Zn3Write3OpsLEA; everything else falls back to WriteLEA.
def Zn3WriteLEA : SchedWriteVariant<[
    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;

// Integer multiplication
defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.

// Integer multiplication, high part: latency only, no resources and no
// micro-ops. The *Ld variant adds the integer load latency.
defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>;
defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;

defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

// 8-bit register form; its Latency and NumMicroOps are read back by the
// memory-form write.
def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

// Memory form of the 8-bit CMPXCHG: the register-form write plus the integer
// load latency and 2 extra micro-ops.
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
  let ReleaseAtCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B,
                                                       LCMPXCHG16B)>;

// Register-register exchanges listed below.
def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr,
                                                      XCHG16ar)>;

def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
// Unsigned division: the divider is held for as many cycles as the latency.
defm : Zn3WriteResIntPair<WriteDiv8,   [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteDiv16,  [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteDiv32,  [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteDiv64,  [Zn3Divider], 17, [17], 2>;
// Signed division: same numbers as the unsigned forms.
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;

defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.

defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.

def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.

def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.

def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;

// Integer shifts and rotates.
defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Rotate-through-carry by one, register forms.
def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

// Memory forms: register-form latency plus the integer load latency, and one
// more micro-op than the register form.
def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

// RCR by immediate, register forms.
def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

// RCR by immediate, memory forms.
def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

// RCL by immediate, register forms.
def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

// RCL by immediate, memory forms.
def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;

defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// RCR by CL, register forms.
def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 3;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

// RCR by CL, memory forms.
def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

// RCL by CL, register forms.
def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

// RCL by CL, memory forms.
def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;

// Double shift instructions.
defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis

// Floating point. This covers both scalar and vector operations.
// x87 constant loads, FP vector loads/stores, masked stores, and the first
// FP add/compare classes.
defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
// FP loads are modelled as the model's VecLoadLatency plus one cycle.
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// NOTE(review): despite "MMX" in the name, the InstRW below binds this record
// to the MOVHPD/MOVHPS (and VEX) *mr instructions.
def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Masked stores: same StoreLatency as plain stores, but with much larger
// Zn3FPSt occupancy and micro-op counts.
defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// Explicit record for the x87 integer-memory add/sub/mul instructions listed
// in the InstRW below.
def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

// Same shape as Zn3WriteX87Arith, but holds Zn3FPU0123 for 62 cycles.
def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;

defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
// FP compare, compare-to-flags, multiply and single-precision divide classes.
// All ZMM variants are declared unsupported.
defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
// FP divide and square root (all on Zn3FPFDiv) and reciprocal estimate
// (on Zn3FPFMul01). All ZMM variants are declared unsupported.
defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
// Reciprocal-sqrt estimate, FMA, dot product, sign, rounding and FP logic
// classes. All ZMM variants are declared unsupported.
defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add.
defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
// FP test, shuffle and blend classes. All ZMM variants are declared
// unsupported.
defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;

// Vector integer operations.
// Vector loads are modelled as the model's VecLoadLatency plus one cycle.
defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;

// Explicit record bound to VEXTRACTF128rr/VEXTRACTI128rr. The two records that
// follow derive their latency and micro-op counts from this one.
def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
  let Latency = 4;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

// NOTE(review): this record uses the store resources (Zn3FPSt, Zn3Store) yet
// its latency adds Znver3Model.LoadLatency rather than StoreLatency -- confirm
// that this is intended.
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

// Bound to VINSERTF128rm; latency and micro-ops are derived from the
// VEXTRACT*128rr record above (load latency added, no extra micro-ops).
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;

defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;

defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;

// Explicit record bound to MMX_MOVQ2FR64rr/MMX_MOVQ2DQrr.
def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

// Differs from Zn3WriteMOVMMX only in holding Zn3FPFMisc0123 for 4 cycles
// instead of 2.
def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;

defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.

// Explicit record bound to EXTRQ/INSERTQ.
def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

// Same as Zn3WriteEXTRQ_INSERTQ except for two micro-ops instead of one.
def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;

defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).

// Same latency, occupancy and micro-op count as the WriteVecALUX entry above,
// but restricted to Zn3FPVAdd01 instead of Zn3FPVAdd0123.
def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;

// Field-for-field identical to Zn3WriteVecALUXSlow; bound to the MMX_*
// instructions listed below.
def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
                                           MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr,
                                           MMX_PAVGBrr, MMX_PAVGWrr,
                                           MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>;

defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

// Explicit record bound to the YMM instructions listed below: one cycle, one
// micro-op, restricted to Zn3FPVAdd01.
def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;

defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>; // Vector integer TEST instructions (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
// Vector integer immediate-shift, multiply, shuffle and blend classes.
// All ZMM variants are declared unsupported.
defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles.
defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
// Variable blends, PSADBW/MPSAD/PHMINPOS, vector insert/extract, MOVMSK, and
// the first int<->float conversion class.
defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.

// Vector insert/extract operations.
// NOTE(review): LoadUOps is negative here (and in a few conversion entries);
// presumably the folded-load form uses fewer micro-ops than register form plus
// a separate load -- confirm against the Zn3WriteRes*Pair multiclass.
defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.

// MOVMSK operations.
defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;

// Conversion between integer and float.
defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
// FP -> integer and integer -> FP conversion classes, all on Zn3FPFCvt01,
// plus explicit records for the MMX conversion instructions.
defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).

// NOTE(review): the InstRW below binds both *rr and *rm instructions to this
// single record, which has no load resources and adds no load latency --
// confirm that this is intended for the *rm forms.
def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.

defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).

// NOTE(review): as with Zn3WriteCvtPD2IMMX, the *rm and *rr instructions share
// this record and no load latency is added -- confirm.
def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 2;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>;

defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).

// Explicit record bound to MMX_CVTPI2PSrr only.
def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>;

defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).

defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).

// Float -> Half uses the non-pair multiclasses; the store forms have their own
// SchedWrite classes (WriteCvtPS2PH*St).
defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).

// CRC32 instruction.
defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;

// SHA instructions. Each *rm record below takes its latency (plus
// Znver3Model.LoadLatency) and its micro-op count from the matching *rr record.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 2;
  let ReleaseAtCycles = [3];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 3];
  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

// Unlike the other SHA *rm records, this one adds one micro-op over its *rr
// counterpart.
def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 6;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;

// EMMS/FEMMS
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis

// Load/store MXCSR
defm : Zn3WriteResInt<WriteLDMXCSR,
                      [Zn3AGU012, Zn3Load, Zn3ALU0123],
                      !add(Znver3Model.LoadLatency, 1),
                      [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSTMXCSR,
                      [Zn3ALU0123, Zn3AGU012, Zn3Store],
                      !add(1, Znver3Model.StoreLatency),
                      [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;

// VZEROUPPER: modelled with zero latency and a single micro-op.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL: modelled with 18 micro-ops holding Zn3FPU0123 for 24 cycles.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;

// AVX2.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
// 256-bit permutes.
// Memory forms add Znver3Model.LoadLatency to a base latency, which is either
// taken from the matching register-form record or given as a literal.

// VPERM2I128 / VPERM2F128, register forms.
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr,
                     VPERM2F128rr)>;

// VPERM2F128, memory form (same micro-op count as the register form).
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

// VPERMPS (YMM), memory form.
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, 7);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 3;
}
def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

// VPERMPD / VPERMQ (YMM), register/immediate forms.
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let Latency = 6;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMYri],
             (instrs VPERMPDYri,
                     VPERMQYri)>;

// VPERMPD (YMM), memory/immediate form (one micro-op more than
// Zn3WriteVPERMYri).
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

// VPERMQ (YMM, memory/immediate) and VPERMD (YMM, memory) share this class.
def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let Latency = !add(Znver3Model.LoadLatency, 5);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteVPERMDYm],
             (instrs VPERMQYmi,
                     VPERMDYrm)>;

defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf], 5, [1], 2, /*LoadUOps=*/1>; // 256-bit width vector variable shuffles.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts.
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Variable vector shifts (ZMM).

// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;

// LFENCE: a single micro-op that holds Zn3LSU for 30 cycles.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;

// SFENCE: a single micro-op that holds Zn3LSU for 1 cycle.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except that it provides a model for nops!
defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis


///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// Write class that consumes no processor resources and has zero latency,
// while still counting as one micro-op.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
  let ReleaseAtCycles = [];
  let NumMicroOps = 1;
}
def : InstRW<[Zn3WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;

// Same as above, but counted as two micro-ops; used for the register-register
// XCHG forms listed below.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let Latency = 0;
  let ReleaseAtCycles = [];
  let NumMicroOps = 2;
}
def : InstRW<[Zn3WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;

defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
// Floating-point register moves. The X/Y variants use no resources and have
// zero latency; the Z variant is marked unsupported.
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : X86WriteResUnsupported<WriteFMoveZ>;

// Vector integer register moves, following the same pattern as above.
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : X86WriteResUnsupported<WriteVecMoveZ>;

// Register moves registered as optimizable; the whole equivalence class is
// guarded by TruePred.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Each SchedWriteVariant below selects Zn3WriteZeroLatency when its predicate
// holds and otherwise falls back to the named generic write class.

// GPR XOR/SUB: zero-latency under ZeroIdiomPredicate, else WriteALU.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV)>;

// GPR CMP: zero-latency under CheckSameRegOperand<0, 1>, else WriteALU.
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                                 [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV)>;

// AVX XMM fp logic: zero-latency under ZeroIdiomPredicate, else WriteFLogic.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn3WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;

// AVX YMM fp logic: zero-latency under ZeroIdiomPredicate, else WriteFLogicY.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;

// AVX XMM integer logic: zero-latency under ZeroIdiomPredicate, else
// WriteVecLogicX.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX],
             (instrs VPXORrr,
                     VPANDNrr)>;

// AVX YMM integer logic: zero-latency under ZeroIdiomPredicate, else
// WriteVecLogicY.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY],
             (instrs VPXORYrr,
                     VPANDNYrr)>;

// AVX XMM integer ALU: zero-latency under ZeroIdiomPredicate, else
// WriteVecALUX.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX],
             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// AVX YMM integer ALU: zero-latency under ZeroIdiomPredicate, else
// WriteVecALUY.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;

// Instructions registered as zero idioms; every class is guarded by
// ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Instructions registered as dependency-breaking. The CMP class is guarded by
// CheckSameRegOperand<0, 1>; every other class by ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV,
    SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;

} // SchedModel