xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver3 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15//    http://www.agner.org/optimize/microarchitecture.pdf
16//  * AMD Zen 3 Ryzen Deep Dive Review
17//    https://www.anandtech.com/show/16214/
18//===----------------------------------------------------------------------===//
19
def Znver3Model : SchedMachineModel {
  // Dispatch width.
  // AMD SOG 19h, 2.9.6 Dispatch: up to 6 macro ops per cycle may be
  // dispatched into the execution engine.
  let IssueWidth = 6;

  // Number of macro ops that can be in flight.
  // AMD SOG 19h, 2.10.3: the retire control unit (RCU) tracks the completion
  // status of every outstanding operation (integer, load/store and
  // floating-point) and is the final arbiter for exception processing and
  // recovery. It accepts up to 6 dispatched macro ops per cycle and tracks up
  // to 256 of them in non-SMT mode (128 per thread in SMT mode).
  let MicroOpBufferSize = 256;

  // Loop buffer size.
  // AMD SOG 19h, 2.9.1 Op Cache: the op cache is an associative cache with
  // 64 sets and 8 ways; each set-way entry holds up to 8 macro ops, for a
  // maximum capacity of 4K ops.
  // With an assumed maximum dispatch of 8 ops/cy and a 12cy mispredict cost
  // from the op-cache, the loop buffer is capped at 8*12 = 96 so that loop
  // unrolling does not excessively fill the op-cache from the frontend.
  let LoopMicroOpBufferSize = 96;

  // Integer load-to-use latency.
  // AMD SOG 19h, 2.6.2 L1 Data Cache: integer load-to-use latency is 4 or 5
  // cycles.
  // AMD SOG 19h, 2.12 L1 Data Cache: the AGU and LS pipelines are optimized
  // for simple address generation modes <...> and can achieve 4-cycle
  // load-to-use integer load latency.
  let LoadLatency = 4;

  // FP load-to-use latency (a field declared by this model).
  // AMD SOG 19h, 2.12 L1 Data Cache: for simple address generation modes the
  // AGU and LS pipelines can achieve <...> 7-cycle load-to-use FP load
  // latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation (a field declared by this model).
  int StoreLatency = 1;

  let HighLatency = 25; // FIXME: any better choice?

  // AMD SOG 19h, 2.8 Optimizing Branching: the branch misprediction penalty
  // ranges from 11 to 18 cycles, <...>; 13 cycles is the common case.
  let MispredictPenalty = 13;

  // Enable the Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  let CompleteModel = 1;
}
63
64let SchedModel = Znver3Model in {
65
66
67//===----------------------------------------------------------------------===//
68// RCU
69//===----------------------------------------------------------------------===//
70
// Retire control unit (AMD SOG 19h, 2.10.3 Retire Control Unit).
// Up to 6 dispatched macro ops are accepted per cycle, and up to 256 macro
// ops are tracked in flight in non-SMT mode (128 per thread in SMT mode).
// <...> In-order commit proceeds at up to eight macro ops per cycle.
def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize,
                               8>; // macro ops committed per cycle
76
77//===----------------------------------------------------------------------===//
78// Units
79//===----------------------------------------------------------------------===//
80
// There are a total of three Units, each one with its own schedulers.
82
83//===----------------------------------------------------------------------===//
84// Integer Execution Unit
85//
86
87// AMD SOG 19h, 2.4 Superscalar Organization
88// The processor uses four decoupled independent integer scheduler queues,
89// each one servicing one ALU pipeline and one or two other pipelines
90
91//
92// Execution pipes
93//===----------------------------------------------------------------------===//
94
// AMD SOG 19h, 2.10.2 Execution Units
// Four general purpose integer execution pipes, each with an ALU capable of
// general purpose integer operations. Defines Zn3ALU0 .. Zn3ALU3.
foreach Pipe = 0...3 in
  def Zn3ALU#Pipe : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// A separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// Three Address Generation Units (AGUs) serve all load and store address
// generation. The 3 store data movement units are associated with the same
// schedulers as the AGUs. Defines Zn3AGU0 .. Zn3AGU2.
foreach Pipe = 0...2 in
  def Zn3AGU#Pipe : ProcResource<1>;
114
115//
116// Execution Units
117//===----------------------------------------------------------------------===//
118
// Aliases for the extra capabilities of individual ALU pipes.

// Divider lives on ALU0.
// AMD SOG 19h, 2.10.2 Execution Units:
// "ALU0 additionally has divide <...> execution capability."
defvar Zn3Divider = Zn3ALU0;

// The second branch unit is ALU0 as well.
// AMD SOG 19h, 2.10.2 Execution Units:
// "ALU0 additionally has <...> branch execution capability."
defvar Zn3BRU0 = Zn3ALU0;

// Integer multiplies are issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;
129
130// Execution pipeline grouping
131//===----------------------------------------------------------------------===//
132
// Any ALU pipe: general ALU operations.
def Zn3ALU0123 : ProcResGroup<[
  Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3
]>;

// Any AGU pipe: general AGU operations.
def Zn3AGU012 : ProcResGroup<[
  Zn3AGU0, Zn3AGU1, Zn3AGU2
]>;

// Either branch unit: control flow (jumps, calls).
def Zn3BRU01 : ProcResGroup<[
  Zn3BRU0, Zn3BRU1
]>;

// Non-control-flow operations that still need access to the CC register,
// i.e. conditional moves and SETcc.
def Zn3ALU03 : ProcResGroup<[
  Zn3ALU0, Zn3ALU3
]>;

// Complex bit twiddling (CRC/PDEP/PEXT) is handled by Zn3ALU1 alone.

// Simple bit twiddling: bit test, shift/rotate, bit extraction.
def Zn3ALU12 : ProcResGroup<[
  Zn3ALU1, Zn3ALU2
]>;
150
151
152//
153// Scheduling
154//===----------------------------------------------------------------------===//
155
// Integer physical register file (PRF).
// AMD SOG 19h, 2.10.3 Retire Control Unit: it consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<
    192, [GR64, CCR], [1, 1], [1, 0],
    6,  // Max moves that can be eliminated per cycle.
    0   // Restrict move elimination to zero regs.
>;
161
// Integer scheduler.
// anandtech: the integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers: the schedulers can receive up to six macro
// ops per cycle, at most two per scheduler, and each scheduler can issue one
// micro op per cycle into each of its associated pipelines.
// FIXME: these are 4 separate schedulers, not a single big one.
def Zn3Int : ProcResGroup<[
  Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
  Zn3ALU1, Zn3AGU1,          // scheduler 1
  Zn3ALU2, Zn3AGU2,          // scheduler 2
  Zn3ALU3,          Zn3BRU1  // scheduler 3
]> {
  // Four schedulers with 24 entries each, modelled as one buffer.
  let BufferSize = !mul(4, 24);
}
175
176
177//===----------------------------------------------------------------------===//
178// Floating-Point Unit
179//
180
181// AMD SOG 19h, 2.4 Superscalar Organization
182// The processor uses <...> two decoupled independent floating point schedulers
183// each servicing two FP pipelines and one store or FP-to-integer pipeline.
184
185//
186// Execution pipes
187//===----------------------------------------------------------------------===//
188
// Six floating point/vector execution pipes in total:
//  * AMD SOG 19h, 2.10.1 Schedulers: "<...>, and six FPU pipes."
//  * Agner, 22.10 Floating point execution pipes: "There are six floating
//    point/vector execution pipes,"
// Pipes 0-3 are separate single-unit resources (Zn3FP0 .. Zn3FP3).
foreach Pipe = 0...3 in
  def Zn3FP#Pipe : ProcResource<1>;

// Pipes 4 and 5 are modelled together as one resource with two units.
def Zn3FP45 : ProcResource<2>;
198
199//
200// Execution Units
201//===----------------------------------------------------------------------===//
202// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
203
// FP multiply: (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ).
// Pipes 0 and 1.
defvar Zn3FPFMul0 = Zn3FP0;
defvar Zn3FPFMul1 = Zn3FP1;

// FP add: (v)FADD*. Pipes 2 and 3.
defvar Zn3FPFAdd0 = Zn3FP2;
defvar Zn3FPFAdd1 = Zn3FP3;

// FP convert: all convert operations except pack/unpack. Pipes 2 and 3.
defvar Zn3FPFCvt0 = Zn3FP2;
defvar Zn3FPFCvt1 = Zn3FP3;

// FP divide: all Divide and Square Root except Reciprocal Approximation.
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources: the FDIV unit can
// support 2 simultaneous operations in flight even though it occupies a
// single pipe.
// FIXME: BufferSize=2 ?
defvar Zn3FPFDiv = Zn3FP1;

// FP misc: moves and logical operations on Floating Point Data Types.
// All four pipes.
defvar Zn3FPFMisc0 = Zn3FP0;
defvar Zn3FPFMisc1 = Zn3FP1;
defvar Zn3FPFMisc2 = Zn3FP2;
defvar Zn3FPFMisc3 = Zn3FP3;
228
// Vector integer add: Integer Adds, Subtracts, and Compares.
// All four pipes, although some complex VADD operations are not available
// in all of them.
defvar Zn3FPVAdd0 = Zn3FP0;
defvar Zn3FPVAdd1 = Zn3FP1;
defvar Zn3FPVAdd2 = Zn3FP2;
defvar Zn3FPVAdd3 = Zn3FP3;

// Vector integer multiply: Integer Multiplies, SAD, Blendvb.
// Pipes 0 and 3.
defvar Zn3FPVMul0 = Zn3FP0;
defvar Zn3FPVMul1 = Zn3FP3;

// Vector shuffle: Data Shuffles, Packs, Unpacks, Permute.
// Some complex shuffle operations are only available in pipe1; pipe 2 is the
// auxiliary shuffle pipe.
defvar Zn3FPVShuf = Zn3FP1;
defvar Zn3FPVShufAux = Zn3FP2;

// Vector shift: Bit Shift Left/Right operations. Pipes 1 and 2.
defvar Zn3FPVShift0 = Zn3FP1;
defvar Zn3FPVShift1 = Zn3FP2;

// Vector misc: moves and logical operations on Packed Integer Data Types.
// All four pipes.
defvar Zn3FPVMisc0 = Zn3FP0;
defvar Zn3FPVMisc1 = Zn3FP1;
defvar Zn3FPVMisc2 = Zn3FP2;
defvar Zn3FPVMisc3 = Zn3FP3;

// *AES*: pipes 0 and 1.
defvar Zn3FPAES0 = Zn3FP0;
defvar Zn3FPAES1 = Zn3FP1;

// *CLM*: pipes 0 and 1.
defvar Zn3FPCLM0 = Zn3FP0;
defvar Zn3FPCLM1 = Zn3FP1;
262
263// Execution pipeline grouping
264//===----------------------------------------------------------------------===//
265
// The four FP pipes 0-3 as one group.
// AMD SOG 19h, 2.11 Floating-Point Unit: stores and floating point to
// general purpose register transfer have 2 dedicated pipelines (pipe 5 and
// 6); those are not listed in this group.
def Zn3FPU0123 : ProcResGroup<[
  Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3
]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[
  Zn3FPFMul0, Zn3FPFMul1
]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn3FPFAdd01 : ProcResGroup<[
  Zn3FPFAdd0, Zn3FPFAdd1
]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[
  Zn3FPFCvt0, Zn3FPFCvt1
]>;

// All Divide and Square Root except Reciprocal Approximation: no group is
// defined, the single Zn3FPFDiv alias is used instead.
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[
  Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3
]>;

// Subset of the above limited to pipes 1 and 2.
def Zn3FPFMisc12 : ProcResGroup<[
  Zn3FPFMisc1, Zn3FPFMisc2
]>;
288
// Loads, Stores and Move to General Register (EX) Operations.
// AMD SOG 19h, 2.11 Floating-Point Unit: stores and floating point to
// general purpose register transfer have 2 dedicated pipelines (pipe 5 and
// 6). Both are represented by the two-unit Zn3FP45 resource.
defvar Zn3FPLd01 = Zn3FP45;

// FP store resource.
// AMD SOG 19h, 2.11 Floating-Point Unit: FP stores are supported on two
// pipelines, but throughput is limited to one per cycle. Hence a single unit
// whose super-resource is Zn3FP45.
def Zn3FPSt : ProcResource<1> {
  let Super = Zn3FP45;
}
300
// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes, hence the
// two narrower groups next to the full one.
def Zn3FPVAdd0123 : ProcResGroup<[
  Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3
]>;

def Zn3FPVAdd01 : ProcResGroup<[
  Zn3FPVAdd0, Zn3FPVAdd1
]>;
def Zn3FPVAdd12 : ProcResGroup<[
  Zn3FPVAdd1, Zn3FPVAdd2
]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[
  Zn3FPVMul0, Zn3FPVMul1
]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn3FPVShuf01 : ProcResGroup<[
  Zn3FPVShuf, Zn3FPVShufAux
]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[
  Zn3FPVShift0, Zn3FPVShift1
]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[
  Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3
]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[
  Zn3FPAES0, Zn3FPAES1
]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[
  Zn3FPCLM0, Zn3FPCLM1
]>;
326
327
328//
329// Scheduling
330//===----------------------------------------------------------------------===//
331
// Floating point physical register file.
// Agner, 21.8 Register renaming and out-of-order schedulers: the floating
// point register file has 160 vector registers, 128 bits each in Zen 1 and
// 256 bits each in Zen 2. anandtech confirms this as well.
def Zn3FpPRF : RegisterFile<
    160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
    6,  // Max moves that can be eliminated per cycle.
    0   // Restrict move elimination to zero regs.
>;
339
// Floating-point scheduler.
// AMD SOG 19h, 2.11 Floating-Point Unit:
//  * the floating-point scheduler has a 2*32 entry macro op capacity;
//  * <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[
  Zn3FP0, Zn3FP2,          /*Zn3FP4,*/ // scheduler 0
  Zn3FP1, Zn3FP3, Zn3FP45  /*Zn3FP5*/  // scheduler 1
]> {
  // Two schedulers with 32 entries each, modelled as one buffer.
  let BufferSize = !mul(2, 32);
}
350
351// AMD SOG 19h, 2.11 Floating-Point Unit
352// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
353// even if floating-point scheduler is full.
354// FIXME: how to model this properly?
355
356
357//===----------------------------------------------------------------------===//
358// Load-Store Unit
359//
360
// Load-store unit as a whole.
// AMD SOG 19h, 2.12 Load-Store Unit: the LS unit contains three largely
// independent pipe-lines enabling the execution of three 256-bit memory
// operations per cycle.
def Zn3LSU : ProcResource<3>;

// Load side of the LS unit.
// AMD SOG 19h, 2.12 Load-Store Unit: all three memory operations can be
// loads.
def Zn3Load : ProcResource<3> {
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit: the LS unit can process up to 72
  // out-of-order loads.
  let BufferSize = 72;
}

def Zn3LoadQueue : LoadQueue<Zn3Load>;

// Store side of the LS unit.
// AMD SOG 19h, 2.12 Load-Store Unit: a maximum of two of the memory
// operations can be stores.
def Zn3Store : ProcResource<2> {
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit: the LS unit utilizes a 64-entry store
  // queue (STQ).
  let BufferSize = 64;
}

def Zn3StoreQueue : StoreQueue<Zn3Store>;
387
388//===----------------------------------------------------------------------===//
389// Basic helper classes.
390//===----------------------------------------------------------------------===//
391
// A lot of SchedWrites come in pairs: one without and one with a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// The helpers below describe the resource usage of both variants.

// Emits one anonymous WriteRes for `SchedRW` on `ExePorts`, with:
//   Lat  - the Latency of the write (default 1),
//   Res  - the ReleaseAtCycles list (default empty),
//   UOps - the NumMicroOps of the write (default 1).
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let Latency = Lat;
    let ReleaseAtCycles = Res;
  }
}
406
// Emits the register form of `SchedRW` and its folded-load form
// (`SchedRW.Folded`).
//
// The folded form differs from the register form as follows:
//   * ports:   `AGU` and Zn3Load are put in front of `ExePorts`;
//   * latency: `LoadLat` is added to `Lat`;
//   * uops:    `LoadUOps` is added to `UOps`;
//   * ReleaseAtCycles: left empty when `Res` is empty and `LoadRes` is 1;
//     otherwise it is [1, LoadRes] followed by `Res`, or by one `1` per
//     execution port when `Res` is empty.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register form.
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Folded-load form.
  defvar FoldedPorts = !listconcat([AGU, Zn3Load], ExePorts);
  // ReleaseAtCycles entries for the execution ports themselves.
  defvar ExeRes = !if(!empty(Res), !listsplat(1, !size(ExePorts)), Res);

  defm : __zn3WriteRes<SchedRW.Folded,
                       FoldedPorts,
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes], ExeRes)),
                       !add(UOps, LoadUOps)>;
}
424
// Entry points for writes that have no folded-load variant.
// The Int, XMM and YMM flavours all forward their arguments unchanged to
// __zn3WriteRes.

multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}
443
// Entry points for writes that do have a folded-load variant.
// Each one forwards to __zn3WriteResPair and fixes the load latency and the
// resource used for the load's address/data path:
//   * Int: Znver3Model.LoadLatency    and the Zn3AGU012 group;
//   * XMM: Znver3Model.VecLoadLatency and Zn3FPLd01;
//   * YMM: Znver3Model.VecLoadLatency and Zn3FPLd01.

multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.LoadLatency,
                           LoadUOps,
                           /*AGU=*/Zn3AGU012,
                           LoadRes>;
}

multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.VecLoadLatency,
                           LoadUOps,
                           /*AGU=*/Zn3FPLd01,
                           LoadRes>;
}

multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.VecLoadLatency,
                           LoadUOps,
                           /*AGU=*/Zn3FPLd01,
                           LoadRes>;
}
471
472
473//===----------------------------------------------------------------------===//
474// Here be dragons.
475//===----------------------------------------------------------------------===//
476
// Reads after an integer load advance by the integer load latency.
def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

// Reads after a vector load (any width) advance by the FP load latency.
foreach VecRead = [ReadAfterVecLd, ReadAfterVecXLd, ReadAfterVecYLd] in
  def : ReadAdvance<VecRead, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Crossing from the F to the I domain, or from I to F, adds 1 cycle of
// latency to a result.
def : ReadAdvance<ReadInt2Fpu, -1>;
487
// An instruction that folds both a load and a store is modeled as a folded
// load plus WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load],
                      !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;

// Models the clobbering of the read-write mask operand of a GATHER operation.
// It uses no resources and no micro-ops; it only carries a latency, equal to
// that of WriteLoad.
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [],
                      !add(Znver3Model.LoadLatency, 1), [], 0>;
498
// Slow loads: same ports, latency and micro-op count as WriteLoad, but with
// ReleaseAtCycles of 3 (instead of 1) on the AGU group.
def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [3, 1];
  let Latency = !add(Znver3Model.LoadLatency, 1);
}
def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX,
                                        MOV16rm,
                                        MOVSX16rm16, MOVSX16rm32,
                                        MOVZX16rm16,
                                        MOVSX16rm8, MOVZX16rm8)>;
505
// Plain and non-temporal stores share one description.
defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store],
                      Znver3Model.StoreLatency, [1, 2], 1>;

// Register-to-register move on any ALU pipe.
defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;

// Misc copies (COPY) are treated as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
512
// MOVBE16rm: uses an AGU, the load unit and an ALU.
def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1, 1, 4];
  let Latency = Znver3Model.LoadLatency;
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

// MOVBE to memory, all widths: uses an ALU, an AGU and the store unit.
def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [4, 1, 1];
  let Latency = Znver3Model.StoreLatency;
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr,
                                        MOVBE32mr,
                                        MOVBE64mr)>;
526
// Arithmetic.

// Simple integer ALU op.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>;

// ALU forms listed below: same latency and micro-op count as WriteALU, but
// with ReleaseAtCycles of 4 on the ALU group.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 1;
}
def : InstRW<[Zn3WriteALUSlow], (instrs
    ADD8i8, ADD16i16, ADD32i32, ADD64i32,
    AND8i8, AND16i16, AND32i32, AND64i32,
     OR8i8,  OR16i16,  OR32i32,  OR64i32,
    SUB8i8, SUB16i16, SUB32i32, SUB64i32,
    XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
540
// Register-to-register sign/zero extending moves listed below.
def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 1;
}
def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32,
                                           MOVZX16rr16,
                                           MOVSX16rr8, MOVZX16rr8)>;

// Materialization of a 32-bit immediate into a register.
def Zn3WriteMaterialize32bitImm : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [2];
  let Latency = 1;
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt,
                                                    MOV64ri32)>;

// Register forms of PDEP/PEXT: executed on ALU1 only.
def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1];
  let Latency = 3;
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;
562
// Integer ALU + flags op.
defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>;

// 8-bit ADC/SBB with a memory destination: uses an AGU, the load unit, an
// ALU and the store unit.
def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                           Zn3ALU0123, Zn3Store]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [1, 1, 7, 1];
  let Latency = 1;
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
571
// Simple LEAs, i.e. those with one or two input operands.
// LEA instructions can't fold loads, so there is no folded variant.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>;

// Write used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [1];
  let Latency = 2;
}

// What counts as a slow LEA on Znver3: either a 3Ops LEA (base, index,
// offset), or an LEA whose `Scale` value is different than 1.
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // Case 1: a 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // Case 2: operand 2 is an immediate and that immediate is not 1,
    // i.e. an LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Picks the slow write when the predicate matches, WriteLEA otherwise.
def Zn3WriteLEA : SchedWriteVariant<[
    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r,
                                    LEA64r,
                                    LEA64_32r)>;

// The 16-bit LEA always gets its own write.
def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [4];
  let Latency = 2; // FIXME: not from llvm-exegesis
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
610
// Integer multiplication. Everything is issued on the multiplier (ALU1).

// Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>;

// Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3,
                          /*LoadUOps=*/1>;
// Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>;
// Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>;

// Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;
// Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>;
// Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>;
// Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>;

// Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;
// Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>;
// Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>;
// Integer 64-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>;

// Integer multiplication, high part: latency only, no resources or
// micro-ops. The load variant adds the integer load latency.
defm : Zn3WriteResInt<WriteIMulHLd, [],
                      !add(4, Znver3Model.LoadLatency), [], 0>;
defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;
626
// Byte Order (Endianness) Swap, 32-bit and 64-bit.
defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>;
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>;
629
// Compare and set, compare and swap.
defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>;

// 8-bit register form.
def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 3;
  let ReleaseAtCycles = [12];
  let Latency = 3;
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

// Compare and set, compare and swap: read-modify-write form.
defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>;

// 8-bit memory forms: derived from the 8-bit register form by adding the
// integer load latency and 2 micro-ops.
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                                  Zn3ALU0123]> {
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
  let ReleaseAtCycles = [1, 1, 12];
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm,
                                                     LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 19;
  let ReleaseAtCycles = [24];
  let Latency = 3; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 28;
  let ReleaseAtCycles = [59];
  let Latency = 4; // FIXME: not from llvm-exegesis
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B,
                                                       LCMPXCHG16B)>;
661
// Register-register exchanges listed below (8-bit and 16-bit forms).
def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [2];
  let Latency = 1;
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr,
                                                      XCHG16rr,
                                                      XCHG16ar)>;

// Exchange with memory, 8-bit and 16-bit.
def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                              Zn3ALU0123]> {
  let NumMicroOps = 5;
  let ReleaseAtCycles = [1, 1, 2];
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 3);
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

// Exchange with memory, 32-bit and 64-bit.
def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                               Zn3ALU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [1, 1, 2];
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 2);
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
682
// Integer division. Everything is issued on the divider (ALU0).
// FIXME: only the 8-bit division numbers are measured (2 uops, latency 10);
//        uops and latency for the other widths are a guess.

// Unsigned.
defm : Zn3WriteResIntPair<WriteDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;

// Signed.
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider], 10, [10], 2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
694
// Bit scan forward / reverse: ALU1 only.
defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>;
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>;

// Bit population count.
defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>;

// 16-bit register form of POPCNT.
def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 1;
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

// Leading zero count.
defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>;

// 16-bit register form of LZCNT.
def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [4];
  let Latency = 1;
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

// Trailing zero count.
defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>;

// 16-bit register form of TZCNT. Unlike WriteTZCNT, this one uses the full
// ALU group.
def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [4];
  let Latency = 2;
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
724
// Conditional move.
defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>;
// X87 conditional move.
// FIXME: not from llvm-exegesis
defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>;
// Set register based on condition code.
defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>;
// SETcc to memory.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store],
                      2, [2, 1, 1], 2>;
// Load/Store flags in AH.
defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>;

// Bit Test, plus its two load variants (immediate and register).
defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>;
defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;

// Bit Test + Set, plus its two load variants (immediate and register).
defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>;
defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12],
                      !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
738
739// Integer shifts and rotates.
// The three generic classes below share identical numbers on Zn3ALU12, and
// each passes LoadUOps=1 for its folded-load variant.
740defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
741defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
742defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
743
// RCL/RCR by 1, register forms: latency 1, one micro-op, Zn3ALU12 held for
// 2 cycles.
744def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
745  let Latency = 1;
746  let ReleaseAtCycles = [2];
747  let NumMicroOps = 1;
748}
749def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
750                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
751
// RCL/RCR by 1, memory forms: derived from Zn3WriteRotateR1 by adding the
// model's LoadLatency to the latency and one micro-op to the count; the AGU
// and load resources are each held for one cycle.
752def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
753  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
754  let ReleaseAtCycles = [1, 1, 2];
755  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
756}
757def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
758                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
759
// RCR by immediate, register forms: latency 3, 7 micro-ops, Zn3ALU12 held
// for 6 cycles.
760def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
761  let Latency = 3;
762  let ReleaseAtCycles = [6];
763  let NumMicroOps = 7;
764}
765def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
766
// RCR by immediate, memory forms: latency is LoadLatency plus the register
// form's latency; Zn3ALU12 is held for 8 cycles (vs 6 for the register form).
// NOTE(review): this adds 3 micro-ops to the register form, whereas the
// sibling memory forms (Zn3WriteRotateLeftMI, Zn3WriteRotateRightMCL,
// Zn3WriteRotateLeftMCL) add 2 - confirm against measurements.
767def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
768  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
769  let ReleaseAtCycles = [1, 1, 8];
770  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
771}
772def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
773
// RCL by immediate, register forms: latency 4, 9 micro-ops, Zn3ALU12 held
// for 8 cycles.
774def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
775  let Latency = 4;
776  let ReleaseAtCycles = [8];
777  let NumMicroOps = 9;
778}
779def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
780
// RCL by immediate, memory forms: latency is LoadLatency plus the register
// form's latency, and the micro-op count is the register form's plus 2.
781def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
782  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
783  let ReleaseAtCycles = [1, 1, 8];
784  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
785}
786def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
787
// Generic rotate-by-CL class; the RCR/RCL-by-CL instructions are bound to
// dedicated writes via InstRW instead.
788defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
789
// RCR by CL, register forms: same numbers as Zn3WriteRotateRightRI
// (latency 3, 7 micro-ops, Zn3ALU12 held for 6 cycles).
790def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
791  let Latency = 3;
792  let ReleaseAtCycles = [6];
793  let NumMicroOps = 7;
794}
795def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
796
// RCR by CL, memory forms: latency is LoadLatency plus the register form's
// latency, and the micro-op count is the register form's plus 2.
797def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
798  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
799  let ReleaseAtCycles = [1, 1, 8];
800  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
801}
802def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
803
// RCL by CL, register forms: same numbers as Zn3WriteRotateLeftRI
// (latency 4, 9 micro-ops, Zn3ALU12 held for 8 cycles).
804def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
805  let Latency = 4;
806  let ReleaseAtCycles = [8];
807  let NumMicroOps = 9;
808}
809def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
810
// RCL by CL, memory forms: latency is LoadLatency plus the register form's
// latency, and the micro-op count is the register form's plus 2.
811def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
812  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
813  let ReleaseAtCycles = [1, 1, 8];
814  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
815}
816def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
817
818// Double shift instructions.
// Register forms run on Zn3ALU12 with latency 2; the memory forms add the
// model's LoadLatency and hold Zn3ALU12 for 4 cycles instead of 3.
819defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
820defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
821defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
822defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
823
824// BMI1 BEXTR/BLS, BMI2 BZHI
825defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
826defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
827defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
828
829// Idioms that clear a register, like xorps %xmm0, %xmm0.
830// These can often bypass execution ports completely.
// Modelled with latency 0 and a zero-cycle hold on Zn3ALU0123, but still
// counted as one micro-op.
831defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
832
833// Branches don't produce values, so they have no latency, but they still
834// consume resources. Indirect branches can fold loads.
835defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
836
837// Floating point. This covers both scalar and vector operations.
// x87 constant loads: all three use Zn3FPLd01, Zn3Load and Zn3FP1, with
// latency expressed as the model's LoadLatency plus 4 (FLD0) or 7 (FLD1, FLDC).
838defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
839defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
840defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
// FP/vector loads (plain and masked, XMM and YMM): all share
// VecLoadLatency + 1 on Zn3FPLd01 and Zn3Load.
841defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
842defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
843defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
844defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
845defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
// Scalar FP store: latency is the model's StoreLatency.
846defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
847
// Override for the (V)MOVHPD / (V)MOVHPS store forms: two micro-ops with a
// fixed latency of 2, using Zn3FPSt and Zn3Store for one cycle each.
// NOTE(review): the record name says "MMX" (and repeats "Write"), but the
// bound instructions are MOVHPD/MOVHPS stores - the name looks like a
// leftover; renaming is left to a follow-up since other parts of the file
// may reference it.
848def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
849  let Latency = 2; // FIXME: not from llvm-exegesis
850  let ReleaseAtCycles = [1, 1];
851  let NumMicroOps = 2;
852}
853def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr,  MOVHPSmr,
854                                               VMOVHPDmr, VMOVHPSmr)>;
855
// Regular and non-temporal FP stores: all single micro-op, StoreLatency,
// Zn3FPSt and Zn3Store held for one cycle each.
856defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
857defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
858defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
859defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
860defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
861
// Masked FP stores: same latency expression as the plain stores, but with a
// multi-cycle hold on Zn3FPSt (4-12 cycles) and a much larger final argument
// (10-42); the 32-bit-element forms are the more expensive of each pair.
862defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
863defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
864defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
865defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
866
867defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point add/sub.
868
// x87 add/sub/reverse-sub/multiply with a 16- or 32-bit integer memory
// operand (the *_FI16m / *_FI32m instructions listed below). Two micro-ops;
// Zn3FPU0123 is held for 24 cycles. The latency (LoadLatency + 1) is a
// placeholder, per the FIXME.
869def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
870  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
871  let ReleaseAtCycles = [1, 1, 24];
872  let NumMicroOps = 2;
873}
874def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
875                                         SUB_FI16m, SUB_FI32m,
876                                         SUBR_FI16m, SUBR_FI32m,
877                                         MUL_FI16m, MUL_FI32m)>;
878
// x87 divide / reverse-divide with a 16- or 32-bit integer memory operand.
// Same shape as Zn3WriteX87Arith, but Zn3FPU0123 is held for 62 cycles
// instead of 24. The latency (LoadLatency + 1) is a placeholder, per the FIXME.
879def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
880  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
881  let ReleaseAtCycles = [1, 1, 62];
882  let NumMicroOps = 2;
883}
884def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
885                                       DIVR_FI16m, DIVR_FI32m)>;
886
// FP add/sub and compare classes. Every ZMM variant in this run is
// registered through X86WriteResPairUnsupported.
887defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
888defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
889defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
890defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
891defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
892defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
893defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
// Compares are modelled on the Zn3FPFMul01 resource with latency 1.
894defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>;  // Floating point compare.
895defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
896defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
897defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
898defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>;  // Floating point double compare.
899defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
900defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
901defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
902defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
903defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
// FP multiply classes: single and double precision share latency 3 on
// Zn3FPFMul01; the ZMM variants are registered as unsupported.
904defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>;  // Floating point multiplication.
905defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
906defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
907defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
908defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
909defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
910defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
911defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
// FP divide and square root, all on Zn3FPFDiv. The hold time on the divider
// grows with the latency: 3 cycles for single-precision divide up to 23 for
// the 80-bit square root. ZMM variants are registered as unsupported.
912defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>;  // Floating point division.
913defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
914defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
915defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
916defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>;  // Floating point double division.
917defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
918defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
919defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
920defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>;   // Floating point square root.
921defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
922defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
923defm : X86WriteResPairUnsupported<WriteFSqrtZ>;  // Floating point square root (ZMM).
924defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>;  // Floating point double square root.
925defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
926defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
927defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
928defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
// Reciprocal / reciprocal-square-root estimates, FMA and dot products.
// Note that the rcp estimate is modelled on Zn3FPFMul01 while the rsqrt
// estimate is modelled on Zn3FPFDiv; both use latency 3.
929defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>;  // Floating point reciprocal estimate.
930defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
931defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
932defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
933defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>;  // Floating point reciprocal square root estimate.
934defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
935defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
936defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
937defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [1], 1>;  // Fused Multiply Add.
938defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
939defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
940defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
// Dot products are multi-micro-op and hold Zn3FPFMul01 for 6-8 cycles; each
// passes an explicit LoadUOps for its folded-load variant.
941defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
942defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
943defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
// Sign manipulation, rounding, logicals, FP test, shuffles and blends.
// ZMM variants are registered as unsupported throughout.
944defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
945defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
946defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
947defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
948defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
949defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
950defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
951defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
952defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
953defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
// Fixed shuffles use latency 1 on Zn3FPVShuf01; variable shuffles use
// latency 3 on the same resource.
954defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
955defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
956defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
957defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
958defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
959defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
960defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
961defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
962defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
963defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
964defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
965defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).
966
967// Horizontal Add/Sub (float and integer)
// The FP forms use Zn3FPFAdd0 with latency 6; the integer forms use
// Zn3FPVAdd0 with latency 2. Both hold their resource for 2 cycles. The
// entries passing LoadUOps=1 have 3 as their final argument, the others 4.
968defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
969defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
970defm : X86WriteResPairUnsupported<WriteFHAddZ>;
971defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
972defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
973defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
974defm : X86WriteResPairUnsupported<WritePHAddZ>;
975
976// Vector integer operations.
// Vector integer loads (plain, non-temporal and masked): all use
// VecLoadLatency + 1 on Zn3FPLd01 and Zn3Load, matching the FP load classes.
977defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
978defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
979defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
980defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
981defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
982defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
983defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
// Vector integer stores: StoreLatency on Zn3FPSt and Zn3Store.
984defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
985defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
986
// Register-to-register 128-bit lane extract (VEXTRACTF128rr / VEXTRACTI128rr):
// one micro-op on Zn3FPFMisc0 with latency 4. The memory-form writes below
// derive their latency and micro-op counts from this record.
987def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
988  let Latency = 4;
989  let ReleaseAtCycles = [1];
990  let NumMicroOps = 1;
991}
992def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
993
// 128-bit lane extract to memory (VEXTRACTI128mr / VEXTRACTF128mr): the
// register form's micro-op count plus one, using Zn3FPFMisc0, Zn3FPSt and
// Zn3Store for one cycle each.
// NOTE(review): this is a store form (its resources are Zn3FPSt/Zn3Store),
// yet the latency adds Znver3Model.LoadLatency rather than StoreLatency -
// confirm whether that is intentional.
994def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
995  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
996  let ReleaseAtCycles = [1, 1, 1];
997  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
998}
999def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1000
// 128-bit lane insert from memory (VINSERTF128rm): latency is LoadLatency
// plus the VEXTRACT register form's latency; the micro-op count is the same
// as that register form (the !add with 0 keeps the derivation explicit).
// NOTE(review): only VINSERTF128rm is bound here; VINSERTI128rm is not in
// this list - presumably it is handled elsewhere or falls back to its
// default write class; verify.
1001def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
1002  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1003  let ReleaseAtCycles = [1, 1, 1];
1004  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1005}
1006def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
1007
// Remaining vector integer stores (YMM and non-temporal): single micro-op,
// StoreLatency.
1008defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1009defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1010defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
// Masked vector integer stores: same numbers as the WriteFMaskedStore*
// entries earlier in this file.
1011defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1012defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
1013defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
1014defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1015
// Moves between vector and general-purpose registers: both directions are
// modelled on Zn3FPLd01, held for 2 cycles.
1016defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
1017defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
1018
// MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr: two micro-ops, latency 1, with
// Zn3FPFMisc0123 held for 2 cycles.
1019def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1020  let Latency = 1;
1021  let ReleaseAtCycles = [1, 2];
1022  let NumMicroOps = 2;
1023}
1024def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1025
// MMX_MOVD64rr / MMX_MOVD64to64rr: identical to Zn3WriteMOVMMX except that
// Zn3FPFMisc0123 is held for 4 cycles instead of 2 (hence "Slow").
1026def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1027  let Latency = 1;
1028  let ReleaseAtCycles = [1, 4];
1029  let NumMicroOps = 2;
1030}
1031def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1032
1033defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.
1034
// EXTRQ / INSERTQ (register-operand forms): latency 3, one micro-op, using
// Zn3FPVShuf01 and Zn3FPLd01 for one cycle each.
1035def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1036  let Latency = 3;
1037  let ReleaseAtCycles = [1, 1];
1038  let NumMicroOps = 1;
1039}
1040def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1041
// EXTRQI / INSERTQI (immediate forms): same latency and resource usage as
// the Zn3WriteEXTRQ_INSERTQ record, but counted as two micro-ops.
1042def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1043  let Latency = 3;
1044  let ReleaseAtCycles = [1, 1];
1045  let NumMicroOps = 2;
1046}
1047def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1048
1049defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1050
// XMM register forms of abs, saturating add/sub, average, sign and
// VPCMPEQQ: same latency, hold time and micro-op count as the generic
// WriteVecALUX entry above, but restricted to Zn3FPVAdd01 instead of
// Zn3FPVAdd0123.
1051def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1052  let Latency = 1;
1053  let ReleaseAtCycles = [1];
1054  let NumMicroOps = 1;
1055}
1056def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1057                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1058                                            PAVGBrr, PAVGWrr,
1059                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
1060                                            VPABSBrr, VPABSDrr, VPABSWrr,
1061                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1062                                            VPAVGBrr, VPAVGWrr,
1063                                            VPCMPEQQrr,
1064                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1065                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1066
// MMX register forms of abs, sign, saturating add/sub and average: one
// micro-op, latency 1, on Zn3FPVAdd01 (the same numbers as
// Zn3WriteVecALUXSlow).
1067def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
1068  let Latency = 1;
1069  let ReleaseAtCycles = [1];
1070  let NumMicroOps = 1;
1071}
1072def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
1073                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
1074                                           MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr,
1075                                           MMX_PAVGBrr, MMX_PAVGWrr,
1076                                           MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>;
1077
1078defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1079
// YMM register forms of abs, saturating add/sub, average, sign and
// VPCMPEQQ: same numbers as the generic WriteVecALUY entry above, but
// restricted to Zn3FPVAdd01 instead of Zn3FPVAdd0123.
1080def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1081  let Latency = 1;
1082  let ReleaseAtCycles = [1];
1083  let NumMicroOps = 1;
1084}
1085def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1086                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1087                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1088                                            VPAVGBYrr, VPAVGWYrr,
1089                                            VPCMPEQQYrr,
1090                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1091
// Vector integer logicals, TEST and shifts. ZMM variants are registered as
// unsupported throughout.
1092defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
1093defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
1094defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1095defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1096defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
// Vector TEST consumes two resources (Zn3FPVAdd12 and Zn3FPSt), one cycle each.
1097defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1098defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
1099defm : X86WriteResPairUnsupported<WriteVecTestZ>;  // Vector integer TEST instructions (ZMM).
// Variable-count and immediate shifts share the same numbers on Zn3FPVShift01.
1100defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
1101defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
1102defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1103defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
1104defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
1105defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1106defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1107defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
// Vector integer multiply, shuffle, blend and sum-of-absolute-differences
// classes. ZMM variants are registered as unsupported throughout.
1108defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
1109defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1110defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1111defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
// PMULLD is given the same numbers as the generic vector multiply above.
1112defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
1113defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1114defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
// Unlike the FP variable shuffles (latency 3), integer fixed and variable
// shuffles are both given latency 1 on Zn3FPVShuf01.
1115defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector shuffles.
1116defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1117defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1118defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
1119defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
1120defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1121defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1122defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
// Fixed blends use Zn3FPVMisc0123; variable blends use Zn3FPVMul01.
1123defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
1124defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1125defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
1126defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
1127defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1128defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
1129defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
1130defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1131defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1132defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
// MPSAD holds Zn3FPVAdd0123 for 8 cycles and passes an explicit LoadUOps.
1133defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1134defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1135defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
1136defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1137
1138// Vector insert/extract operations.
// Insert passes LoadUOps=-1 for its folded-load variant - presumably the
// load form needs one micro-op fewer than the register form; verify against
// the Zn3WriteResXMMPair definition.
1139defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1140defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1141defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1142
1143// MOVMSK operations.
// All four MOVMSK classes share the same numbers on Zn3FPVMisc2.
1144defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1145defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1146defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
1147defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1148
1149// Conversion between integer and float.
// All conversion classes in this section run on Zn3FPFCvt01.
1150defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>;  // Double -> Integer.
1151defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
1152defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
1153defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
1154
// Packed double -> MMX integer conversions (truncating and non-truncating):
// two micro-ops, latency 1, Zn3FPFCvt01 held for 2 cycles.
// NOTE(review): the rm (memory-operand) forms are bound to the same write
// as the rr forms, and this write lists no AGU/load resource and adds no
// load latency - confirm that is intended.
1155def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1156  let Latency = 1;
1157  let ReleaseAtCycles = [2];
1158  let NumMicroOps = 2;
1159}
1160def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>;
1161
// Float -> integer: the scalar form is two micro-ops with latency 2; the
// packed XMM and YMM forms are a single micro-op with latency 3.
1162defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>;  // Float -> Integer.
1163
1164defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1165defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
1166defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
1167
// Integer -> double: the scalar and YMM forms pass LoadUOps=-1 for their
// folded-load variants; the packed XMM form uses the default.
1168defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
1169defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1170defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1171defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
1172
// MMX integer -> packed double conversion: two micro-ops, latency 2,
// Zn3FPFCvt01 held for 6 cycles.
// NOTE(review): the rm (memory-operand) form shares this write with the rr
// form, and the write lists no AGU/load resource and adds no load latency -
// confirm that is intended.
1173def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1174  let Latency = 2;
1175  let ReleaseAtCycles = [6];
1176  let NumMicroOps = 2;
1177}
1178def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>;
1179
// Integer -> float: the scalar form passes LoadUOps=-1 for its folded-load
// variant; the packed forms are a single micro-op with latency 3.
1180defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
1181defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1182defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1183defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
1184
// MMX integer -> packed float, register form only: two micro-ops, latency 3,
// Zn3FPFCvt01 held for one cycle.
1185def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1186  let Latency = 3;
1187  let ReleaseAtCycles = [1];
1188  let NumMicroOps = 2;
1189}
1190def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>;
1191
1192defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion.
1193defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1194defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1195defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).
1196
// Double -> float narrowing conversions, all on Zn3FPFCvt01. The YMM form is
// given larger arguments (6, [2], 2) than the scalar/XMM forms (3, [1], 1);
// the ZMM pair is marked unsupported.
1197defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion.
1198defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1199defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1200defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).
1201
// Half -> float conversions on Zn3FPFCvt01 (XMM and YMM register/memory
// pairs); the ZMM pair is marked unsupported.
1202defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1203defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1204defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
1205
// Float -> half conversions. These use the non-pair multiclasses, so the
// register-destination and store-destination forms are declared separately.
// The store forms (…St) add the Zn3FPSt and Zn3Store resources and add
// Znver3Model.StoreLatency on top of the register-form latency (3 for XMM,
// 6 for YMM). Both ZMM forms are marked unsupported.
1206defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1207defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1208defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
1209defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1210defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1211defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
1212
1213// CRC32 instruction.
// Bound to the single Zn3ALU1 resource (not the whole ALU group) with
// arguments 3, [1], 1; declared as a register/memory pair.
1214defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;
1215
// SHA1MSG1, register form: 2 micro-ops on Zn3FPU0123, latency 2, resource
// ReleaseAtCycles 2.
1216def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1217  let Latency = 2;
1218  let ReleaseAtCycles = [2];
1219  let NumMicroOps = 2;
1220}
1221def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1222
// SHA1MSG1, memory form: derived from the register form. Latency is the
// model's LoadLatency plus the rr latency, Zn3AGU012 and Zn3Load are each
// used for 1 cycle in addition to the FPU, and no extra micro-op is added
// (the !add(..., 0) keeps the rr count).
1223def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1224  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
1225  let ReleaseAtCycles = [1, 1, 2];
1226  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
1227}
1228def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
1229
// SHA1MSG2 / SHA1NEXTE, register forms: a single micro-op on Zn3FPU0123 with
// latency 1 and resource ReleaseAtCycles 2.
1230def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
1231  let Latency = 1;
1232  let ReleaseAtCycles = [2];
1233  let NumMicroOps = 1;
1234}
1235def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1236
// SHA1MSG2 / SHA1NEXTE, memory forms: LoadLatency is added to the rr latency,
// Zn3AGU012 and Zn3Load are each used for 1 cycle, and the micro-op count is
// unchanged from the register form.
// NOTE(review): the record name uses a "Zn3Writerm_" prefix, unlike the
// "Zn3Write<Instr>rm" naming of the SHA1MSG1 class in this file.
1237def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1238  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1239  let ReleaseAtCycles = [1, 1, 2];
1240  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1241}
1242def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1243
// SHA256MSG1, register form: 2 micro-ops on Zn3FPU0123, latency 2, resource
// ReleaseAtCycles 3.
1244def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1245  let Latency = 2;
1246  let ReleaseAtCycles = [3];
1247  let NumMicroOps = 2;
1248}
1249def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
1250
// SHA256MSG1, memory form: rr latency plus LoadLatency, with Zn3AGU012 and
// Zn3Load each used for 1 cycle; micro-op count equals the register form.
// NOTE(review): named with a "Zn3Writerm_" prefix rather than
// "Zn3WriteSHA256MSG1rm" - inconsistent with the rr record's naming.
1251def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1252  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
1253  let ReleaseAtCycles = [1, 1, 3];
1254  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
1255}
1256def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1257
// SHA256MSG2, register form: 4 micro-ops on Zn3FPU0123, latency 3, resource
// ReleaseAtCycles 8.
1258def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
1259  let Latency = 3;
1260  let ReleaseAtCycles = [8];
1261  let NumMicroOps = 4;
1262}
1263def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
1264
// SHA256MSG2, memory form: rr latency plus LoadLatency, Zn3AGU012 and Zn3Load
// each used for 1 cycle. Unlike the other SHA memory forms in this file, one
// extra micro-op is added on top of the register form's count.
1265def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1266  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
1267  let ReleaseAtCycles = [1, 1, 8];
1268  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
1269}
1270def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1271
// SHA1RNDS4 (register/immediate form): a single micro-op on Zn3FPU0123 with
// latency 6 and resource ReleaseAtCycles 8. No memory-form class is declared
// next to it in this span.
1272def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
1273  let Latency = 6;
1274  let ReleaseAtCycles = [8];
1275  let NumMicroOps = 1;
1276}
1277def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1278
// SHA256RNDS2 (register form): a single micro-op on Zn3FPU0123 with latency 4
// and resource ReleaseAtCycles 8.
1279def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
1280  let Latency = 4;
1281  let ReleaseAtCycles = [8];
1282  let NumMicroOps = 1;
1283}
1284def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1285
1286// String instructions.
// All four packed string-compare writes are register/memory pairs on
// Zn3FPVAdd0123. Three of them pass an explicit LoadUOps override; the
// implicit-length/return-index form uses the multiclass default.
1287// Packed Compare Implicit Length Strings, Return Mask
1288defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1289// Packed Compare Explicit Length Strings, Return Mask
1290defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1291// Packed Compare Implicit Length Strings, Return Index
1292defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
1293// Packed Compare Explicit Length Strings, Return Index
1294defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1295
1296// AES instructions.
// All three AES write classes share identical arguments (4, [1], 1) on the
// Zn3FPAES01 resource and are declared as register/memory pairs.
1297defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
1298defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
1299defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.
1300
1301// Carry-less multiplication instructions.
// Register/memory pair on Zn3FPCLM01 with arguments 4, [4], 4.
1302defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;
1303
1304// EMMS/FEMMS
// Modeled on the integer ALU group (Zn3ALU0123) with arguments 2, [1], 1;
// per the FIXME the latency value was not measured with llvm-exegesis.
1305defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1306
1307// Load/store MXCSR
// LDMXCSR: latency is LoadLatency + 1; uses Zn3AGU012 and Zn3Load for 1 cycle
// each and Zn3ALU0123 for 6. STMXCSR: latency is 1 + StoreLatency; uses
// Zn3ALU0123 for 60 cycles plus Zn3AGU012 and Zn3Store for 1 cycle each.
// Both latencies are flagged as not coming from llvm-exegesis.
1308defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1309defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1310
1311// Catch-all for expensive system instructions.
// Uses the same round placeholder value (100) for all three numeric
// arguments on Zn3ALU0123, i.e. a deliberately pessimistic cost rather than
// a measured one (presumably - the values are identical to WriteMicrocoded).
1312defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;
1313
// VZEROUPPER: a single micro-op on Zn3FPU0123 with zero latency and resource
// ReleaseAtCycles 1. The latency is flagged as not measured.
1314def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
1315  let Latency = 0; // FIXME: not from llvm-exegesis
1316  let ReleaseAtCycles = [1];
1317  let NumMicroOps = 1;
1318}
1319def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;
1320
// VZEROALL: 18 micro-ops on Zn3FPU0123 with latency 10 and resource
// ReleaseAtCycles 24. The latency is flagged as not measured.
1321def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
1322  let Latency = 10; // FIXME: not from llvm-exegesis
1323  let ReleaseAtCycles = [24];
1324  let NumMicroOps = 18;
1325}
1326def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;
1327
1328// AVX2.
// 256-bit shuffle write classes, all on Zn3FPVShuf and all declared as
// register/memory pairs. The two FP variants pass an explicit LoadUOps
// override; the integer WriteShuffle256 uses the multiclass default.
1329defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1330defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1331defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
1332
// VPERM2I128 / VPERM2F128, register forms: a single micro-op on Zn3FPVShuf
// with latency 3 and resource ReleaseAtCycles 1.
1333def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
1334  let Latency = 3;
1335  let ReleaseAtCycles = [1];
1336  let NumMicroOps = 1;
1337}
1338def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
1339
// VPERM2F128, memory form: rr latency plus LoadLatency, with Zn3AGU012 and
// Zn3Load each used for 1 cycle; micro-op count equals the register form.
// NOTE(review): only VPERM2F128rm is bound here; VPERM2I128rm has no override
// in this span and presumably falls back to its default write - confirm.
1340def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1341  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
1342  let ReleaseAtCycles = [1, 1, 1];
1343  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1344}
1345def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
1346
// VPERMPS (YMM, memory form): 3 micro-ops; latency is LoadLatency + 7.
// Zn3AGU012 and Zn3Load are each used for 1 cycle and Zn3FPVShuf for 2.
1347def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1348  let Latency = !add(Znver3Model.LoadLatency, 7);
1349  let ReleaseAtCycles = [1, 1, 2];
1350  let NumMicroOps = 3;
1351}
1352def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1353
// VPERMPD / VPERMQ (YMM, register+immediate forms): 2 micro-ops on
// Zn3FPVShuf with latency 6 and resource ReleaseAtCycles 1.
1354def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
1355  let Latency = 6;
1356  let ReleaseAtCycles = [1];
1357  let NumMicroOps = 2;
1358}
1359def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
1360
// VPERMPD (YMM, memory+immediate form): derived from Zn3WriteVPERMYri by
// adding LoadLatency to the latency and one micro-op to the count; Zn3AGU012
// and Zn3Load are used for 1 cycle each and Zn3FPVShuf for 2.
1361def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1362  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
1363  let ReleaseAtCycles = [1, 1, 2];
1364  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
1365}
1366def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1367
// Shared memory-form class for VPERMQ (YMM, mi) and VPERMD (YMM, rm):
// 2 micro-ops; latency is LoadLatency + 5. Zn3AGU012 and Zn3Load are each
// used for 1 cycle and Zn3FPVShuf for 2.
1368def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1369  let Latency = !add(Znver3Model.LoadLatency, 5);
1370  let ReleaseAtCycles = [1, 1, 2];
1371  let NumMicroOps = 2;
1372}
1373def : InstRW<[Zn3WriteVPERMDYm], (instrs VPERMQYmi, VPERMDYrm)>;
1374
// Remaining AVX2 writes: width-changing moves (Zn3FPVShuf01), integer
// variable shuffles (Zn3FPVShuf) and variable vector shifts (Zn3FPVShift01).
// The XMM and YMM variable shifts share the same arguments (1, [1], 1); the
// ZMM variable shift pair is marked unsupported.
1375defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1376defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf], 5, [1], 2, /*LoadUOps=*/1>; // 256-bit width vector variable shuffles.
1377defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts.
1378defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1379defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Variable vector shifts (ZMM).
1380
1381// Old microcoded instructions that nobody uses.
// Given the same round placeholder value (100) for all three numeric
// arguments on Zn3ALU0123 as the WriteSystem catch-all.
1382defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;
1383
1384// Fence instructions.
// Generic fence write: arguments 1, [100], 1 on Zn3ALU0123, i.e. a short
// latency but a long resource occupancy. LFENCE and SFENCE are given their
// own InstRW overrides elsewhere in this file.
1385defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;
1386
// LFENCE: a single micro-op on the Zn3LSU resource with latency 1 and
// resource ReleaseAtCycles 30.
1387def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
1388  let Latency = 1;
1389  let ReleaseAtCycles = [30];
1390  let NumMicroOps = 1;
1391}
1392def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;
1393
// SFENCE: a single micro-op on the Zn3LSU resource with latency 1 and
// resource ReleaseAtCycles 1 (much shorter occupancy than LFENCE's 30).
1394def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
1395  let Latency = 1;
1396  let ReleaseAtCycles = [1];
1397  let NumMicroOps = 1;
1398}
1399def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;
1400
1401// Nop, not very useful except it provides a model for nops!
// Zero latency, but still one micro-op that uses Zn3ALU0123 for 1 cycle.
1402defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1403
1404
1405///////////////////////////////////////////////////////////////////////////////
1406// Zero Cycle Move
1407///////////////////////////////////////////////////////////////////////////////
1408
// Write class for instructions modeled as free in the execution units: it
// lists no processor resources (empty resource and ReleaseAtCycles lists),
// has zero latency, and still counts as one micro-op. It is bound below to
// the 32/64-bit GPR register moves and MOVSX32rr32, and is also the
// "idiom matched" alternative of the SchedWriteVariants in this file.
1409def Zn3WriteZeroLatency : SchedWriteRes<[]> {
1410  let Latency = 0;
1411  let ReleaseAtCycles = [];
1412  let NumMicroOps = 1;
1413}
1414def : InstRW<[Zn3WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1415                                               MOV64rr, MOV64rr_REV,
1416                                               MOVSX32rr32)>;
1417
// Write class for the 32/64-bit register-register XCHG forms: like
// Zn3WriteZeroLatency it uses no processor resources and has zero latency,
// but it counts as two micro-ops.
1418def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
1419  let Latency = 0;
1420  let ReleaseAtCycles = [];
1421  let NumMicroOps = 2;
1422}
1423def : InstRW<[Zn3WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1424                                               XCHG64rr, XCHG64ar)>;
1425
// Generic WriteXCHG class: arguments 0, [8], 2 on Zn3ALU0123. The 32/64-bit
// register forms are overridden separately by Zn3WriteSwapRenameable.
1426defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;        // Compare+Exchange - TODO RMW support.
1427
// Floating-point register moves. The XMM (WriteFMoveX) and YMM (WriteFMoveY)
// moves use no resources and have zero latency; WriteFMove itself is noted
// as an empty sched class and the ZMM form is unsupported.
// NOTE(review): WriteFMove is bound to Zn3FPVMisc0123 while WriteVecMove in
// this file uses Zn3FPFMisc0123 - the FPF/FPV choice looks swapped between
// the two; confirm against the resource definitions.
1428defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
1429defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1430defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1431defm : X86WriteResUnsupported<WriteFMoveZ>;
1432
// Vector integer register moves. The XMM (WriteVecMoveX) and YMM
// (WriteVecMoveY) moves use no resources and have zero latency; WriteVecMove
// itself (annotated as MMX) costs 1 cycle on a misc unit, and the ZMM form is
// unsupported.
// NOTE(review): WriteVecMove is bound to Zn3FPFMisc0123 while WriteFMove in
// this file uses Zn3FPVMisc0123 - the FPF/FPV choice looks swapped between
// the two; confirm against the resource definitions.
1433defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
1434defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1435defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
1436defm : X86WriteResUnsupported<WriteVecMoveZ>;
1437
// Declares which register-to-register moves this model treats as
// optimizable register moves. All listed opcodes form one equivalence class
// guarded by TruePred, i.e. the property holds unconditionally for them.
// The list covers 32/64-bit GPR moves and exchanges plus SSE, AVX-128 and
// AVX-256 vector moves; MMX moves are intentionally left out.
1438def : IsOptimizableRegisterMove<[
1439  InstructionEquivalenceClass<[
1440    // GPR variants.
1441    MOV32rr, MOV32rr_REV,
1442    MOV64rr, MOV64rr_REV,
1443    MOVSX32rr32,
1444    XCHG32rr, XCHG32ar,
1445    XCHG64rr, XCHG64ar,
1446
1447    // MMX variants.
1448    // MMX moves are *NOT* eliminated.
1449
1450    // SSE variants.
1451    MOVAPSrr, MOVAPSrr_REV,
1452    MOVUPSrr, MOVUPSrr_REV,
1453    MOVAPDrr, MOVAPDrr_REV,
1454    MOVUPDrr, MOVUPDrr_REV,
1455    MOVDQArr, MOVDQArr_REV,
1456    MOVDQUrr, MOVDQUrr_REV,
1457
1458    // AVX variants.
1459    VMOVAPSrr, VMOVAPSrr_REV,
1460    VMOVUPSrr, VMOVUPSrr_REV,
1461    VMOVAPDrr, VMOVAPDrr_REV,
1462    VMOVUPDrr, VMOVUPDrr_REV,
1463    VMOVDQArr, VMOVDQArr_REV,
1464    VMOVDQUrr, VMOVDQUrr_REV,
1465
1466    // AVX YMM variants.
1467    VMOVAPSYrr, VMOVAPSYrr_REV,
1468    VMOVUPSYrr, VMOVUPSYrr_REV,
1469    VMOVAPDYrr, VMOVAPDYrr_REV,
1470    VMOVUPDYrr, VMOVUPDYrr_REV,
1471    VMOVDQAYrr, VMOVDQAYrr_REV,
1472    VMOVDQUYrr, VMOVDQUYrr_REV,
1473  ], TruePred >
1474]>;
1475
1476///////////////////////////////////////////////////////////////////////////////
1477// Dependency breaking instructions.
1478///////////////////////////////////////////////////////////////////////////////
1479
// Variant write for GPR XOR/SUB (32- and 64-bit register forms): when
// ZeroIdiomPredicate matches, the instruction is scheduled as
// Zn3WriteZeroLatency; otherwise it falls back to the ordinary WriteALU.
// The variants are tried in order, with NoSchedPred acting as the default.
1480def Zn3WriteZeroIdiom : SchedWriteVariant<[
1481    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1482    SchedVar<NoSchedPred,                          [WriteALU]>
1483]>;
1484def : InstRW<[Zn3WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1485                                          XOR64rr, XOR64rr_REV,
1486                                          SUB32rr, SUB32rr_REV,
1487                                          SUB64rr, SUB64rr_REV)>;
1488
// Variant write for register-register CMP (8/16/32/64-bit): when operands 0
// and 1 are the same register (CheckSameRegOperand<0, 1>), the compare is
// scheduled as Zn3WriteZeroLatency; otherwise as the ordinary WriteALU.
1489def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1490    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
1491    SchedVar<NoSchedPred,                                 [WriteALU]>
1492]>;
1493def : InstRW<[Zn3WriteZeroIdiomEFLAGS], (instrs CMP8rr,  CMP8rr_REV,
1494                                                CMP16rr, CMP16rr_REV,
1495                                                CMP32rr, CMP32rr_REV,
1496                                                CMP64rr, CMP64rr_REV)>;
1497
// Variant write for the AVX-128 FP logic zero idioms (VXORPS/PD, VANDNPS/PD):
// Zn3WriteZeroLatency when ZeroIdiomPredicate matches, WriteFLogic otherwise.
// Only the VEX-encoded forms are bound; see the NOTE below for the SSE ones.
1498def Zn3WriteFZeroIdiom : SchedWriteVariant<[
1499    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1500    SchedVar<NoSchedPred,                          [WriteFLogic]>
1501]>;
1502// NOTE: XORPSrr, XORPDrr are not zero-cycle!
1503def : InstRW<[Zn3WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1504                                           VANDNPSrr, VANDNPDrr)>;
1505
// YMM counterpart of the variant above: same predicate and zero-latency
// alternative, with WriteFLogicY as the fallback.
1506def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
1507    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1508    SchedVar<NoSchedPred,                          [WriteFLogicY]>
1509]>;
1510def : InstRW<[Zn3WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1511                                            VANDNPSYrr, VANDNPDYrr)>;
1512
// Variant write for the AVX-128 integer logic zero idioms (VPXOR, VPANDN):
// Zn3WriteZeroLatency when ZeroIdiomPredicate matches, WriteVecLogicX
// otherwise. Only the VEX-encoded forms are bound.
1513def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
1514    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1515    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
1516]>;
1517// NOTE: PXORrr,PANDNrr are not zero-cycle!
1518def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
1519
// YMM counterpart of the variant above, falling back to WriteVecLogicY.
1520def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
1521    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1522    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
1523]>;
1524def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
1525
// Variant write for the AVX-128 integer ALU zero idioms (VPSUB{B,W,D,Q} and
// VPCMPGT{B,W,D,Q}): Zn3WriteZeroLatency when ZeroIdiomPredicate matches,
// WriteVecALUX otherwise. Only the VEX-encoded forms are bound.
1526def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
1527    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1528    SchedVar<NoSchedPred,                          [WriteVecALUX]>
1529]>;
1530// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1531//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1532def : InstRW<[Zn3WriteVZeroIdiomALUX],
1533             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1534                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1535
// YMM counterpart of the variant above, falling back to WriteVecALUY.
1536def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
1537    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1538    SchedVar<NoSchedPred,                          [WriteVecALUY]>
1539]>;
1540def : InstRW<[Zn3WriteVZeroIdiomALUY],
1541             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1542                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1543
// Declares the opcodes recognized as zero idioms on this model. Every class
// is guarded by ZeroIdiomPredicate. Note that this list is broader than the
// zero-latency InstRW overrides in this file: it also contains the legacy
// SSE forms and the saturating subtracts (PSUBS*/PSUBUS* and VEX variants),
// which are not given a zero-latency scheduling variant there.
1544def : IsZeroIdiomFunction<[
1545  // GPR Zero-idioms.
1546  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1547                     XOR64rr, XOR64rr_REV,
1548                     SUB32rr, SUB32rr_REV,
1549                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1550
1551  // SSE XMM Zero-idioms.
1552  DepBreakingClass<[
1553    // fp variants.
1554    XORPSrr, XORPDrr,
1555    ANDNPSrr, ANDNPDrr,
1556
1557    // int variants.
1558    PXORrr,
1559    PANDNrr,
1560    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1561    PSUBSBrr, PSUBSWrr,
1562    PSUBUSBrr, PSUBUSWrr,
1563    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1564  ], ZeroIdiomPredicate>,
1565
1566  // AVX XMM Zero-idioms.
1567  DepBreakingClass<[
1568    // fp variants.
1569    VXORPSrr, VXORPDrr,
1570    VANDNPSrr, VANDNPDrr,
1571
1572    // int variants.
1573    VPXORrr,
1574    VPANDNrr,
1575    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1576    VPSUBSBrr, VPSUBSWrr,
1577    VPSUBUSBrr, VPSUBUSWrr,
1578    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1579  ], ZeroIdiomPredicate>,
1580
1581  // AVX YMM Zero-idioms.
1582  DepBreakingClass<[
1583    // fp variants.
1584    VXORPSYrr, VXORPDYrr,
1585    VANDNPSYrr, VANDNPDYrr,
1586
1587    // int variants.
1588    VPXORYrr,
1589    VPANDNYrr,
1590    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1591    VPSUBSBYrr, VPSUBSWYrr,
1592    VPSUBUSBYrr, VPSUBUSWYrr,
1593    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1594  ], ZeroIdiomPredicate>,
1595]>;
1596
// Declares the opcodes recognized as dependency-breaking (but not listed as
// zero idioms above) on this model: SBB and the PCMPEQ families under
// ZeroIdiomPredicate, and register-register CMP when operands 0 and 1 are
// the same register (CheckSameRegOperand<0, 1>).
1597def : IsDepBreakingFunction<[
1598  // GPR
1599  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
1600                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
1601  DepBreakingClass<[ CMP8rr,  CMP8rr_REV,
1602                     CMP16rr, CMP16rr_REV,
1603                     CMP32rr, CMP32rr_REV,
1604                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
1605
1606  // MMX
1607  DepBreakingClass<[
1608    MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
1609  ], ZeroIdiomPredicate>,
1610
1611  // SSE
1612  DepBreakingClass<[
1613    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1614  ], ZeroIdiomPredicate>,
1615
1616  // AVX XMM
1617  DepBreakingClass<[
1618    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1619  ], ZeroIdiomPredicate>,
1620
1621  // AVX YMM
1622  DepBreakingClass<[
1623    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
1624  ], ZeroIdiomPredicate>,
1625]>;
1626
1627} // SchedModel
1628