xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver4.td (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver4 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//===----------------------------------------------------------------------===//
15
// Top-level machine model for AMD Znver4: dispatch width, reorder window,
// loop-buffer budget and the baseline latencies used by the rest of the file.
def Znver4Model : SchedMachineModel {
  // Dispatch width.
  // AMD SOG 19h, 2.9.6 Dispatch
  // Up to 6 macro ops per cycle may be dispatched into the execution engine.
  let IssueWidth = 6;

  // Reorder window.
  // AMD SOG 19h, 2.10.3 Retire Control Unit
  // The RCU tracks the completion status of all outstanding operations
  // (integer, load/store, and floating-point) and is the final arbiter for
  // exception processing and recovery. It can receive up to 6 macro ops
  // dispatched per cycle and track up to 320 macro ops in-flight in non-SMT
  // mode, or 160 per thread in SMT mode.
  let MicroOpBufferSize = 320;

  // Loop buffer budget.
  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is an associative cache with 64 sets and 8 ways; each
  // set-way entry holds up to 8 macro ops, for a maximum capacity of
  // 6.75K ops.
  // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy
  // from the op-cache, the loop buffer is capped at 9*12 = 108 so that loop
  // unrolling does not excessively fill the op-cache from the frontend.
  let LoopMicroOpBufferSize = 108;

  // Integer load-to-use latency.
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The AGU and LS pipelines are optimized for simple address generation
  // modes. <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // FP/vector load-to-use latency (model-local helper field).
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The AGU and LS pipelines are optimized for simple address generation
  // modes. <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation (model-local helper field).
  int StoreLatency = 1;

  let HighLatency = 25; // FIXME: any better choice?

  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  let CompleteModel = 1;
}
59
60let SchedModel = Znver4Model in {
61
62
63//===----------------------------------------------------------------------===//
64// RCU
65//===----------------------------------------------------------------------===//
66
// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to nine macro ops per cycle.
//
// The reorder-buffer size is taken from the model so the two cannot drift
// apart; the second argument is the per-cycle retire limit quoted above.
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
72
73//===----------------------------------------------------------------------===//
74// Integer Execution Unit
75//
76
77// AMD SOG 19h, 2.4 Superscalar Organization
78// The processor uses four decoupled independent integer scheduler queues,
79// each one servicing one ALU pipeline and one or two other pipelines
80
81//
82// Execution pipes
83//===----------------------------------------------------------------------===//
84
// AMD SOG 19h, 2.10.2 Execution Units
// Four general purpose integer execution pipes, each with an ALU capable of
// general purpose integer operations. One single-unit resource per pipe.
def Zn4ALU0 : ProcResource<1>;
def Zn4ALU1 : ProcResource<1>;
def Zn4ALU2 : ProcResource<1>;
def Zn4ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// The separate (dedicated) branch execution unit.
def Zn4BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// Three Address Generation Units (AGUs) serve all load and store address
// generation. The 3 store data movement units share schedulers with the AGUs
// and are not given resources of their own here.
def Zn4AGU0 : ProcResource<1>;
def Zn4AGU1 : ProcResource<1>;
def Zn4AGU2 : ProcResource<1>;
104
105//
106// Execution Units
107//===----------------------------------------------------------------------===//
108
// The aliases below give functional-unit names to the pipes hosting them.

// Integer divider lives on ALU0.
// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn4Divider = Zn4ALU0;

// Second branch unit is ALU0 itself.
// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn4BRU0 = Zn4ALU0;

// Integer multiplies are issued on ALU1.
defvar Zn4Multiplier = Zn4ALU1;
119
120// Execution pipeline grouping
121//===----------------------------------------------------------------------===//
122
// Any of the four ALUs: general ALU operations.
def Zn4ALU0123 : ProcResGroup<[
  Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3
]>;

// Any of the three AGUs: general address generation.
def Zn4AGU012 : ProcResGroup<[
  Zn4AGU0, Zn4AGU1, Zn4AGU2
]>;

// Control flow (jumps, calls): either branch unit.
def Zn4BRU01 : ProcResGroup<[
  Zn4BRU0, Zn4BRU1
]>;

// Non-control-flow operations that still need to access the CC register,
// namely conditional moves and SETcc.
def Zn4ALU03 : ProcResGroup<[
  Zn4ALU0, Zn4ALU3
]>;

// Complex bit twiddling (CRC/PDEP/PEXT) is handled by Zn4ALU1 alone,
// so it needs no group.

// Simple bit twiddling: bit test, shift/rotate, bit extraction.
def Zn4ALU12 : ProcResGroup<[
  Zn4ALU1, Zn4ALU2
]>;
140
141
142//
143// Scheduling
144//===----------------------------------------------------------------------===//
145
// Integer physical register file.
// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 224 registers.
def Zn4IntegerPRF : RegisterFile<
    224,          // Physical registers in the file.
    [GR64, CCR],  // Register classes renamed through this file.
    [1, 1],       // Per-class register cost.
    [1, 0],       // Per-class move-elimination flag: GR64 yes, CCR no.
    6,            // Max moves that can be eliminated per cycle.
    0>;           // Restrict move elimination to zero regs.
151
// Integer scheduler, modelled as one group over every integer-side pipe.
// anandtech: the integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines.
def Zn4Int : ProcResGroup<[
  Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
  Zn4ALU1, Zn4AGU1,          // scheduler 1
  Zn4ALU2, Zn4AGU2,          // scheduler 2
  Zn4ALU3,          Zn4BRU1  // scheduler 3
]> {
  // Four queues of 24 entries each.
  let BufferSize = !mul(4, 24);
}
164
165
166//===----------------------------------------------------------------------===//
167// Floating-Point Unit
168//
169
170// AMD SOG 19h, 2.4 Superscalar Organization
171// The processor uses <...> two decoupled independent floating point schedulers
172// each servicing two FP pipelines and one store or FP-to-integer pipeline.
173
174//
175// Execution pipes
176//===----------------------------------------------------------------------===//
177
// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes.
// Pipes 0-3 each get a resource of their own; the remaining two pipes are
// modelled together as one resource with two units.
def Zn4FP0  : ProcResource<1>;
def Zn4FP1  : ProcResource<1>;
def Zn4FP2  : ProcResource<1>;
def Zn4FP3  : ProcResource<1>;
def Zn4FP45 : ProcResource<2>;
187
188//
189// Execution Units
190//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// Each alias below names the FP pipe that hosts one functional unit.

// FP multiply: (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn4FPFMul0 = Zn4FP0;
defvar Zn4FPFMul1 = Zn4FP1;

// FP add: (v)FADD*
defvar Zn4FPFAdd0 = Zn4FP2;
defvar Zn4FPFAdd1 = Zn4FP3;

// FP convert: all convert operations except pack/unpack
defvar Zn4FPFCvt0 = Zn4FP2;
defvar Zn4FPFCvt1 = Zn4FP3;

// FP divide: all Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn4FPFDiv = Zn4FP1;

// FP misc: moves and logical operations on Floating Point Data Types
defvar Zn4FPFMisc0 = Zn4FP0;
defvar Zn4FPFMisc1 = Zn4FP1;
defvar Zn4FPFMisc2 = Zn4FP2;
defvar Zn4FPFMisc3 = Zn4FP3;

// Vector integer add: Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn4FPVAdd0 = Zn4FP0;
defvar Zn4FPVAdd1 = Zn4FP1;
defvar Zn4FPVAdd2 = Zn4FP2;
defvar Zn4FPVAdd3 = Zn4FP3;

// Vector integer multiply: Integer Multiplies, SAD, Blendvb
defvar Zn4FPVMul0 = Zn4FP0;
defvar Zn4FPVMul1 = Zn4FP3;

// Vector shuffle: Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn4FPVShuf    = Zn4FP1;
defvar Zn4FPVShufAux = Zn4FP2;

// Vector shift: Bit Shift Left/Right operations
defvar Zn4FPVShift0 = Zn4FP1;
defvar Zn4FPVShift1 = Zn4FP2;

// Vector misc: moves and logical operations on Packed Integer Data Types
defvar Zn4FPVMisc0 = Zn4FP0;
defvar Zn4FPVMisc1 = Zn4FP1;
defvar Zn4FPVMisc2 = Zn4FP2;
defvar Zn4FPVMisc3 = Zn4FP3;

// *AES*
defvar Zn4FPAES0 = Zn4FP0;
defvar Zn4FPAES1 = Zn4FP1;

// *CLM*
defvar Zn4FPCLM0 = Zn4FP0;
defvar Zn4FPCLM1 = Zn4FP1;
251
252// Execution pipeline grouping
253//===----------------------------------------------------------------------===//
254
// The four FP pipes modelled as individual resources.
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6); those are not part of this
// group.
def Zn4FPU0123 : ProcResGroup<[
  Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3
]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn4FPFMul01 : ProcResGroup<[
  Zn4FPFMul0, Zn4FPFMul1
]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn4FPFAdd01 : ProcResGroup<[
  Zn4FPFAdd0, Zn4FPFAdd1
]>;

// All convert operations except pack/unpack
def Zn4FPFCvt01 : ProcResGroup<[
  Zn4FPFCvt0, Zn4FPFCvt1
]>;

// All Divide and Square Root except Reciprocal Approximation
// (single pipe, so no group is defined):
// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn4FPFMisc0123 : ProcResGroup<[
  Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3
]>;

// FIXUP and RANGE use FP01 pipelines
def Zn4FPFMisc01 : ProcResGroup<[
  Zn4FPFMisc0, Zn4FPFMisc1
]>;
def Zn4FPFMisc12 : ProcResGroup<[
  Zn4FPFMisc1, Zn4FPFMisc2
]>;
// SCALE instructions use FP23 pipelines
def Zn4FPFMisc23 : ProcResGroup<[
  Zn4FPFMisc2, Zn4FPFMisc3
]>;
def Zn4FPFMisc123 : ProcResGroup<[
  Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3
]>;
282
// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
// Both are covered by the two-unit Zn4FP45 resource.
defvar Zn4FPLd01 = Zn4FP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
// Hence a single-unit sub-resource of Zn4FP45.
def Zn4FPSt : ProcResource<1> {
  let Super = Zn4FP45;
}
294
// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes, which is why
// narrower groups are defined alongside the full one.
def Zn4FPVAdd0123 : ProcResGroup<[
  Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3
]>;

def Zn4FPVAdd01 : ProcResGroup<[
  Zn4FPVAdd0, Zn4FPVAdd1
]>;
def Zn4FPVAdd12 : ProcResGroup<[
  Zn4FPVAdd1, Zn4FPVAdd2
]>;

// AVX512 Opmask pipelines
// NOTE: despite the "01" suffix, this group is built from Zn4FP2 and Zn4FP3.
def Zn4FPOpMask01 : ProcResGroup<[
  Zn4FP2, Zn4FP3
]>;
def Zn4FPOpMask4 : ProcResGroup<[
  Zn4FP45
]>;

// Integer Multiplies, SAD, Blendvb
def Zn4FPVMul01 : ProcResGroup<[
  Zn4FPVMul0, Zn4FPVMul1
]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn4FPVShuf01 : ProcResGroup<[
  Zn4FPVShuf, Zn4FPVShufAux
]>;

// Bit Shift Left/Right operations
def Zn4FPVShift01 : ProcResGroup<[
  Zn4FPVShift0, Zn4FPVShift1
]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn4FPVMisc0123 : ProcResGroup<[
  Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3
]>;

// *AES*
def Zn4FPAES01 : ProcResGroup<[
  Zn4FPAES0, Zn4FPAES1
]>;

// *CLM*
def Zn4FPCLM01 : ProcResGroup<[
  Zn4FPCLM0, Zn4FPCLM1
]>;
324
325
326//
327// Scheduling
328//===----------------------------------------------------------------------===//
329
// Floating-point / vector physical register file.
// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 192 vector registers
// of 512b each in zen4.
//
// The move-elimination list carries one flag per register class. The original
// list had only three entries for four classes, leaving VR512 to the implicit
// default (not eligible); that default is now spelled out so the lists line
// up and the effective behaviour is unchanged.
// NOTE(review): confirm whether VR512 moves should be eligible for
// elimination on Zen4; if so, flip the last flag to 1.
def Zn4FpPRF : RegisterFile<
    192,                          // Physical registers in the file.
    [VR64, VR128, VR256, VR512],  // Register classes renamed through it.
    [1, 1, 1, 1],                 // Per-class register cost.
    [0, 1, 1, 0],                 // Per-class move-elimination flag.
    6,                            // Max moves that can be eliminated per cycle.
    0>;                           // Restrict move elimination to zero regs.
336
// Floating-point scheduler, modelled as one group over all FP pipes.
// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn4FP : ProcResGroup<[
  Zn4FP0, Zn4FP2,          /*Zn4FP4,*/ // scheduler 0
  Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/  // scheduler 1
]> {
  // Two queues of 32 entries each.
  let BufferSize = !mul(2, 32);
}
347
348// AMD SOG 19h, 2.11 Floating-Point Unit
349// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
350// even if floating-point scheduler is full.
351// FIXME: how to model this properly?
352
353
354//===----------------------------------------------------------------------===//
355// Load-Store Unit
356//
357
// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn4LSU : ProcResource<3>;

// Load side of the LS unit.
// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
def Zn4Load : ProcResource<3> {
  let Super = Zn4LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn4LoadQueue : LoadQueue<Zn4Load>;

// Store side of the LS unit.
// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
def Zn4Store : ProcResource<2> {
  let Super = Zn4LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn4StoreQueue : StoreQueue<Zn4Store>;
384
385//===----------------------------------------------------------------------===//
386// Basic helper classes.
387//===----------------------------------------------------------------------===//
388
389// Many SchedWrites are defined in pairs with and without a folded load.
390// Instructions with folded loads are usually micro-fused, so they only appear
391// as two micro-ops when dispatched by the schedulers.
392// This multiclass defines the resource usage for variants with and without
393// folded loads.
394
// Emits one anonymous WriteRes binding SchedRW to ExePorts.
//   Lat  - result latency.
//   Res  - ReleaseAtCycles list, forwarded unchanged (may be empty).
//   UOps - number of micro-ops.
multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let Latency = Lat;
    let ReleaseAtCycles = Res;
  }
}
403
// Emits the register form of SchedRW and its folded-load form
// (SchedRW.Folded).
//   LoadLat  - latency added for the folded load.
//   LoadUOps - micro-ops added for the folded load.
//   AGU      - address-generation resource prepended for the folded load.
//   LoadRes  - release cycles charged to Zn4Load in the folded form.
multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register form: everything is forwarded as given.
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Folded-load form: AGU and Zn4Load are placed in front of ExePorts.
  // The release list stays empty only when Res is empty and LoadRes is 1.
  // Otherwise it is [1, LoadRes] followed by Res, or by one 1 per execution
  // port when Res is empty.
  defm : __Zn4WriteRes<
      SchedRW.Folded,
      !listconcat([AGU, Zn4Load], ExePorts),
      !add(Lat, LoadLat),
      !if(!and(!empty(Res), !eq(LoadRes, 1)),
          [],
          !listconcat([1, LoadRes],
                      !if(!empty(Res),
                          !listsplat(1, !size(ExePorts)),
                          Res))),
      !add(UOps, LoadUOps)>;
}
421
// Wrappers for SchedWrites without a folded-load variant.
// All four forward their arguments to __Zn4WriteRes unchanged; the suffix
// (Int/XMM/YMM/ZMM) only labels the operand width at the use site.

multiclass Zn4WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn4WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn4WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn4WriteResZMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}
446
// Wrappers for SchedWrites that also have a folded-load variant.
// Each forwards to __Zn4WriteResPair, supplying the load latency and the
// address/load resource that go with its register domain.

// Integer domain: integer load latency, address generated on Zn4AGU012.
multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver4Model.LoadLatency,
                           LoadUOps, /*AGU=*/Zn4AGU012, LoadRes>;
}

// 128-bit vector domain: vector load latency, Zn4FPLd01 as load resource.
multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver4Model.VecLoadLatency,
                           LoadUOps, /*AGU=*/Zn4FPLd01, LoadRes>;
}

// 256-bit vector domain: same parameters as the XMM wrapper.
multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver4Model.VecLoadLatency,
                           LoadUOps, /*AGU=*/Zn4FPLd01, LoadRes>;
}

// 512-bit vector domain: note that UOps defaults to 2 here, not 1.
multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 2,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver4Model.VecLoadLatency,
                           LoadUOps, /*AGU=*/Zn4FPLd01, LoadRes>;
}
483
484//===----------------------------------------------------------------------===//
485// Here be dragons.
486//===----------------------------------------------------------------------===//
487
// Operands read after a folded integer load: advance by the integer
// load latency.
def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;

// Operands read after a folded vector load (generic, XMM and YMM forms):
// advance by the vector load latency.
def : ReadAdvance<ReadAfterVecLd,  Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;

// Domain-crossing penalty, expressed as a negative advance.
// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;
498
// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW, so WriteRMW only describes the store half (0 extra uops).
defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store],
                      Znver4Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load],
                      !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;

// Models the effect of clobbering the read-write mask operand of the GATHER
// operation. It costs nothing by itself (no resources, no uops) and only has
// latency, matching that of WriteLoad.
defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [],
                      !add(Znver4Model.LoadLatency, 1), [], 0>;
509
// Slower loads: same latency as WriteLoad, but the AGU group is held for
// 3 cycles instead of 1.
def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
  let NumMicroOps = 1;
  let Latency = !add(Znver4Model.LoadLatency, 1);
  let ReleaseAtCycles = [3, 1];
}
// 8-/16-bit plain loads and 16-bit-destination extending loads.
def : InstRW<[Zn4WriteMOVSlow], (instrs
    MOV8rm, MOV8rm_NOREX, MOV16rm,
    MOVSX16rm16, MOVSX16rm32, MOVZX16rm16,
    MOVSX16rm8, MOVZX16rm8)>;
516
// Plain and non-temporal stores share one description: one AGU cycle and
// two cycles on the store resource.
defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store],
                      Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store],
                      Znver4Model.StoreLatency, [1, 2], 1>;

// Register-to-register move.
defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
523
// 16-bit byte-swapping load: AGU + load + ALU, one uop, integer load latency.
def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let NumMicroOps = 1;
  let Latency = Znver4Model.LoadLatency;
  let ReleaseAtCycles = [1, 1, 4];
}
def : InstRW<[Zn4WriteMOVBE16rm], (instrs
    MOVBE16rm)>;
530
// Byte-swapping stores (all widths): ALU + AGU + store, two uops,
// store latency.
def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
  let NumMicroOps = 2;
  let Latency = Znver4Model.StoreLatency;
  let ReleaseAtCycles = [4, 1, 1];
}
def : InstRW<[Zn4WriteMOVBEmr], (instrs
    MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
537
// Arithmetic.

// Simple integer ALU op.
defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>;

// Accumulator-with-immediate ALU forms: same latency and uop count as
// WriteALU, but the ALU group is held for 4 cycles.
def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4WriteALUSlow], (instrs
    ADD8i8, ADD16i16, ADD32i32, ADD64i32,
    AND8i8, AND16i16, AND32i32, AND64i32,
     OR8i8,  OR16i16,  OR32i32,  OR64i32,
    SUB8i8, SUB16i16, SUB32i32, SUB64i32,
    XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
551
// Register-to-register sign/zero extension with a 16-bit destination.
def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4WriteMoveExtend], (instrs
    MOVSX16rr16, MOVSX16rr32, MOVZX16rr16,
    MOVSX16rr8, MOVZX16rr8)>;

// Materializing a 32-bit immediate into a register.
def Zn4WriteMaterialize32bitImm : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs
    MOV32ri, MOV32ri_alt, MOV64ri32)>;
565
// PDEP/PEXT, register forms: issued on ALU1 only, 3-cycle latency.
def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WritePDEP_PEXT], (instrs
    PDEP32rr, PDEP64rr,
    PEXT32rr, PEXT64rr)>;
573
// Integer ALU + flags op.
defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>;

// 8-bit ADC/SBB with a memory destination: AGU + load + ALU + store,
// with the ALU group held for 7 cycles.
def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1, 1, 7, 1];
}
def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs
    ADC8mr, SBB8mr)>;
582
// Simple LEAs, with one or two input operands, execute on an AGU.
// LEA instructions can't fold loads, hence no Pair variant.
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>;

// Resource description used for slow LEA instructions: ALU, 2 cycles, 2 uops.
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [1];
}

// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn4SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // Case 1: a 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // Case 2: operand 2 (the "Scale") is an immediate other than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Pick the slow description when the predicate matches, else plain WriteLEA.
def Zn4WriteLEA : SchedWriteVariant<[
  SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
  SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn4WriteLEA], (instrs
    LEA32r, LEA64r, LEA64_32r)>;

// 16-bit LEA always uses its own slow description.
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [4];
}

def : InstRW<[Zn4SlowLEA16r], (instrs
    LEA16r)>;
621
// Integer multiplication.
// Every variant issues on Zn4Multiplier; the arguments after the port list
// are latency, release cycles and uop count.
defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>;    // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// The high-part writes consume no resources and no uops; they only carry
// latency (plus the integer load latency for the folded-load form).
defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>;  // Integer multiplication, high part, folded load.
defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>;  // Integer multiplication, high part.
637
// Byte Order (Endianness) Swap, 32-bit and 64-bit.
defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>;
defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>;

// Compare and set, compare and swap.
defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>;

// 8-bit register form: fewer uops than the generic write.
def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 3;
  let Latency = 3;
  let ReleaseAtCycles = [12];
}
def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs
    CMPXCHG8rr)>;

// Compare and set, compare and swap (read-modify-write form).
defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>;

// 8-bit memory forms: derived from the register form, adding the integer
// load latency and two more uops.
def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
  let ReleaseAtCycles = [1, 1, 12];
}
def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs
    CMPXCHG8rm, LCMPXCHG8)>;

def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 19;
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs
    CMPXCHG8B)>;

def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 28;
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [59];
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs
    CMPXCHG16B, LCMPXCHG16B)>;
672
// 8-/16-bit register exchanges: two uops on the ALU group.
def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs
    XCHG8rr, XCHG16rr, XCHG16ar)>;

// 8-/16-bit exchange with memory.
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let NumMicroOps = 5;
  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs
    XCHG8rm, XCHG16rm)>;

// 32-/64-bit exchange with memory.
def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let NumMicroOps = 2;
  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs
    XCHG32rm, XCHG64rm)>;
693
// Integer division.
// Every variant occupies Zn4Divider for as many cycles as its latency.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.

// Unsigned.
defm : Zn4WriteResIntPair<WriteDiv8,  [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
// Signed.
defm : Zn4WriteResIntPair<WriteIDiv8,  [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
705
// Bit scan forward / reverse: ALU1 only, 6 uops (+1 with a folded load).
defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>;

// Bit population count.
defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>;

// 16-bit register POPCNT holds the ALU group for 4 cycles.
def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4WritePOPCNT16rr], (instrs
    POPCNT16rr)>;

// Leading zero count.
defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>;

// 16-bit register LZCNT holds the ALU group for 4 cycles.
def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs
    LZCNT16rr)>;

// Trailing zero count.
defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>;

// 16-bit register TZCNT uses the full ALU group, held for 4 cycles.
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs
    TZCNT16rr)>;
735
736defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
737defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
738defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
739defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
740defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
741
// Bit test (and bit test + set) classes. The register forms use Zn4ALU12
// alone; the *Ld forms list Zn4AGU012, Zn4Load and Zn4ALU12 and compute their
// latency as Znver4Model.LoadLatency plus the register form's 1 or 2.
742defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
743defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
744defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;
745
746defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
747defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
748defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
749
750// Integer shifts and rotates.
// The three generic classes below share identical parameters (Zn4ALU12,
// LoadUOps=1); the RCL/RCR forms are overridden by InstRW records further on.
751defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
752defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
753defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
754
// Rotate-through-carry by one, register forms (RCL*r1 / RCR*r1): a single
// micro-op with Latency = 1 and ReleaseAtCycles = [2] on Zn4ALU12.
755def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
756  let Latency = 1;
757  let ReleaseAtCycles = [2];
758  let NumMicroOps = 1;
759}
760def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
761                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
762
// Memory forms of rotate-through-carry by one (RCL*m1 / RCR*m1). Latency and
// micro-op count are derived from Zn4WriteRotateR1: LoadLatency is added to
// its latency and one micro-op is added to its count. The ALU entry keeps the
// register form's 2 in ReleaseAtCycles.
763def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
764  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
765  let ReleaseAtCycles = [1, 1, 2];
766  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
767}
768def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
769                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
770
// RCR by immediate, register forms: 7 micro-ops, Latency = 3, and
// ReleaseAtCycles = [6] on Zn4ALU12.
771def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
772  let Latency = 3;
773  let ReleaseAtCycles = [6];
774  let NumMicroOps = 7;
775}
776def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
777
// RCR by immediate, memory forms. Latency is LoadLatency plus the register
// form's latency; the micro-op count is the register form's plus 3.
// NOTE(review): the other memory rotate records in this file (RCL*mi,
// RCR*mCL, RCL*mCL) add 2 micro-ops, and the ALU entry here is 8 while the
// register form uses 6 - confirm both values are intended.
778def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
779  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
780  let ReleaseAtCycles = [1, 1, 8];
781  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
782}
783def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
784
// RCL by immediate, register forms: 9 micro-ops, Latency = 4, and
// ReleaseAtCycles = [8] on Zn4ALU12.
785def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
786  let Latency = 4;
787  let ReleaseAtCycles = [8];
788  let NumMicroOps = 9;
789}
790def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
791
// RCL by immediate, memory forms. Derived from Zn4WriteRotateLeftRI:
// LoadLatency is added to its latency and 2 micro-ops to its count; the ALU
// entry keeps the register form's 8 in ReleaseAtCycles.
792def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
793  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
794  let ReleaseAtCycles = [1, 1, 8];
795  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
796}
797def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
798
// Rotates by CL. The generic class uses the same parameters as WriteRotate.
// RCR*rCL is overridden with a record whose values equal those of the
// immediate form Zn4WriteRotateRightRI (Latency 3, [6], 7 micro-ops).
799defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
800
801def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
802  let Latency = 3;
803  let ReleaseAtCycles = [6];
804  let NumMicroOps = 7;
805}
806def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
807
// RCR by CL, memory forms. Derived from Zn4WriteRotateRightRCL: LoadLatency
// is added to its latency and 2 micro-ops to its count.
808def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
809  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
810  let ReleaseAtCycles = [1, 1, 8];
811  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
812}
813def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
814
// RCL by CL, register forms. The values equal those of the immediate form
// Zn4WriteRotateLeftRI (Latency 4, [8], 9 micro-ops).
815def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
816  let Latency = 4;
817  let ReleaseAtCycles = [8];
818  let NumMicroOps = 9;
819}
820def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
821
// RCL by CL, memory forms. Derived from Zn4WriteRotateLeftRCL: LoadLatency
// is added to its latency and 2 micro-ops to its count.
822def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
823  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
824  let ReleaseAtCycles = [1, 1, 8];
825  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
826}
827def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
828
829// Double shift instructions.
// Register forms use Zn4ALU12 only; memory forms add Zn4AGU012 and Zn4Load
// and add Znver4Model.LoadLatency to the register latency of 2.
830defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
831defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
832defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
833defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
834
835// BMI1 BEXTR/BLS, BMI2 BZHI
// WriteBEXTR and WriteBZHI are bound to Zn4ALU12; WriteBLS may use any of
// Zn4ALU0123. All three pass LoadUOps=1.
836defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
837defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
838defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
839
840// Idioms that clear a register, like xorps %xmm0, %xmm0.
841// These can often bypass execution ports completely.
// The entry still names Zn4ALU0123 but passes 0 and [0] for the latency and
// cycle-list arguments, with 1 as the final argument.
842defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
843
844// Branches don't produce values, so they have no latency, but they still
845// consume resources. Indirect branches can fold loads.
// NOTE(review): the entry below nevertheless passes 1 in what appears to be
// the latency position - confirm whether that matches the comment above.
846defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
847
848// Floating point. This covers both scalar and vector operations.
// x87 constant loads (FLD0/FLD1/FLDC) list Zn4FPLd01, Zn4Load and Zn4FP1 and
// add 4 or 7 to Znver4Model.LoadLatency. The vector load classes all pass
// VecLoadLatency + 1 on Zn4FPLd01 + Zn4Load; WriteFStore passes StoreLatency
// on Zn4FPSt + Zn4Store.
849defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
850defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
851defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
852defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
853defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
854defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
855defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
856defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
857defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
858
// Two-micro-op store record on Zn4FPSt + Zn4Store with a hard-coded
// Latency = 2. It is applied to the MOVHPD/MOVHPS stores and their VEX forms.
// NOTE(review): the record name says "MMX" and repeats "Write", yet the
// instructions bound below are SSE/AVX high-half stores - the name looks
// historical; confirm before relying on it.
859def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
860  let Latency = 2; // FIXME: not from llvm-exegesis
861  let ReleaseAtCycles = [1, 1];
862  let NumMicroOps = 2;
863}
864def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr,  MOVHPSmr,
865                                               VMOVHPDmr, VMOVHPSmr)>;
866
// Remaining FP store classes. Plain and non-temporal stores all pass
// StoreLatency with [1, 1] and a final 1. The masked stores keep
// StoreLatency but pass larger first cycle entries (4..12) and larger final
// arguments (10..42).
867defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
868defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
869defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
870defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
871defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
872
873defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
874defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
875defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
876defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
877
// Scalar FP add/sub class, followed by an override for the x87
// integer-memory-operand add/sub/subr/mul forms (*_FI16m / *_FI32m). The
// override is two micro-ops, adds 1 to LoadLatency (value flagged FIXME), and
// uses 24 as the Zn4FPU0123 entry of ReleaseAtCycles.
878defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point add/sub.
879
880def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
881  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
882  let ReleaseAtCycles = [1, 1, 24];
883  let NumMicroOps = 2;
884}
885def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
886                                         SUB_FI16m, SUB_FI32m,
887                                         SUBR_FI16m, SUBR_FI32m,
888                                         MUL_FI16m, MUL_FI32m)>;
889
// x87 divide with an integer memory operand (DIV/DIVR _FI16m/_FI32m). Same
// shape as Zn4WriteX87Arith except that the Zn4FPU0123 entry of
// ReleaseAtCycles is 62 instead of 24. The latency value is flagged FIXME.
890def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
891  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
892  let ReleaseAtCycles = [1, 1, 62];
893  let NumMicroOps = 2;
894}
895def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
896                                       DIVR_FI16m, DIVR_FI32m)>;
897
// Floating-point arithmetic classes (add, compare, multiply, divide, sqrt,
// reciprocal estimates, FMA, dot product, sign, rounding) for scalar, XMM,
// YMM and ZMM widths. Throughout this table each ZMM entry passes twice the
// bracketed cycle count of its YMM sibling (e.g. [1] -> [2], [5] -> [10]).
// NOTE(review): WriteFCmp passes 2/[2] and WriteFCmp64 passes 1/[1], while
// the X and Y variants of both pass 2/[1] - confirm the scalar entries are
// meant to differ from each other.
898defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
899defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
900defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
901defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
902defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
903defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
904defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
905defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>;  // Floating point compare.
906defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
907defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
908defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
909defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>;  // Floating point double compare.
910defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
911defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
912defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
913defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
914defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
915defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>;  // Floating point multiplication.
916defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
917defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
918defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
919defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
920defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
921defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
922defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
923defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>;  // Floating point division.
924defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
925defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
926defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
927defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>;  // Floating point double division.
928defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
929defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
930defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
931defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>;   // Floating point square root.
932defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
933defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
934defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>;  // Floating point square root (ZMM).
935defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>;  // Floating point double square root.
936defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
937defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
938defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
939defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
940defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>;  // Floating point reciprocal estimate.
941defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
942defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
943defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
944defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>;  // Floating point reciprocal square root estimate.
945defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
946defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
947defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
948defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>;  // Fused Multiply Add.
949defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
950defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
951defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
952defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
953defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
954defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
955defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
956defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
957defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
958defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).
959
// FP-domain logicals, TEST, shuffles and blends. As elsewhere in this file,
// each ZMM entry passes twice the bracketed cycle count of its YMM sibling.
// NOTE(review): WriteFTestZ passes 1 as its final argument while the X and Y
// variants pass 2 - confirm this difference is intended.
960defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
961defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
962defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
963defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
964defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
965defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
966defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
967defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
968defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
969defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
970defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
971defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
972defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
973defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
974defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
975defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
976defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
977defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).
978
979// Horizontal Add/Sub (float and integer)
// Float variants are bound to Zn4FPFAdd0, integer variants to Zn4FPVAdd0.
// NOTE(review): WritePHAddZ passes 2 in the latency position while
// WritePHAddY passes 3, and only some entries pass an explicit LoadUOps -
// confirm both against measurements.
980defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
981defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
982defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
983defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
984defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
985defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
986defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
987
988// Vector integer operations.
// Vector integer loads mirror the FP load classes: Zn4FPLd01 + Zn4Load with
// VecLoadLatency + 1. The two store classes use Zn4FPSt + Zn4Store with
// StoreLatency.
989defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
990defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
991defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
992defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
993defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
994defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
995defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
996defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
997defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
998
// Register-to-register 128-bit lane extract (VEXTRACTF128rr/VEXTRACTI128rr):
// one micro-op on Zn4FPFMisc0 with Latency = 4. The two memory-form records
// that follow derive their latency and micro-op count from this one.
999def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
1000  let Latency = 4;
1001  let ReleaseAtCycles = [1];
1002  let NumMicroOps = 1;
1003}
1004def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
1005
// 128-bit lane extract to memory (VEXTRACTI128mr/VEXTRACTF128mr). Resources
// are the extract unit plus the FP store path; the micro-op count is the
// register form's plus 1.
// NOTE(review): this is a store form, yet its latency adds
// Znver4Model.LoadLatency rather than StoreLatency - confirm intended.
1006def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
1007  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1008  let ReleaseAtCycles = [1, 1, 1];
1009  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1010}
1011def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1012
// 128-bit lane insert from memory (VINSERTF128rm). Latency is LoadLatency
// plus the VEXTRACT register form's latency; the micro-op count is that
// form's count plus 0, i.e. unchanged.
// NOTE(review): only VINSERTF128rm is bound here; no VINSERTI128rm binding
// is visible in this part of the file - confirm it is covered elsewhere.
1013def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
1014  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1015  let ReleaseAtCycles = [1, 1, 1];
1016  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1017}
1018def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
1019
// Remaining vector integer store classes. The masked-store entries use the
// same cycle lists and final arguments as the WriteFMaskedStore* entries
// earlier in this file.
1020defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1021defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1022defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1023defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1024defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
1025defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
1026defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1027
// Moves between vector registers and GPRs: both directions use identical
// parameters on Zn4FPLd01.
1028defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
1029defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
1030
// MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr: two micro-ops with Latency = 1, using
// Zn4FPLd01 and Zn4FPFMisc0123 with ReleaseAtCycles = [1, 2].
1031def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1032  let Latency = 1;
1033  let ReleaseAtCycles = [1, 2];
1034  let NumMicroOps = 2;
1035}
1036def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1037
// MMX_MOVD64rr / MMX_MOVD64to64rr: same shape as Zn4WriteMOVMMX, but the
// Zn4FPFMisc0123 entry of ReleaseAtCycles is 4 instead of 2.
1038def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1039  let Latency = 1;
1040  let ReleaseAtCycles = [1, 4];
1041  let NumMicroOps = 2;
1042}
1043def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1044
// Default vector integer ALU class, followed by an override for the
// register forms EXTRQ / INSERTQ: one micro-op, Latency = 3, using both
// Zn4FPVShuf01 and Zn4FPLd01 with ReleaseAtCycles = [1, 1].
1045defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.
1046
1047def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1048  let Latency = 3;
1049  let ReleaseAtCycles = [1, 1];
1050  let NumMicroOps = 1;
1051}
1052def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1053
// Immediate forms EXTRQI / INSERTQI: identical to Zn4WriteEXTRQ_INSERTQ
// except that NumMicroOps is 2 instead of 1.
1054def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1055  let Latency = 3;
1056  let ReleaseAtCycles = [1, 1];
1057  let NumMicroOps = 2;
1058}
1059def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1060
// XMM vector integer ALU class, followed by an override for a set of
// register-register instructions (abs, saturating add/sub, average, sign,
// and VPCMPEQQrr). The override narrows the resource to Zn4FPVAdd01 and
// raises Latency and ReleaseAtCycles to 2 and [2]; it stays one micro-op.
1061defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1062
1063def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1064  let Latency = 2;
1065  let ReleaseAtCycles = [2];
1066  let NumMicroOps = 1;
1067}
1068def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1069                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1070                                            PAVGBrr, PAVGWrr,
1071                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
1072                                            VPABSBrr, VPABSDrr, VPABSWrr,
1073                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1074                                            VPAVGBrr, VPAVGWrr,
1075                                            VPCMPEQQrr,
1076                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1077                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1078
// AVX-512 mask-register operations (K* instructions): one micro-op,
// Latency = 1, on Zn4FPOpMask01. The list covers the kk/rk register moves
// and the rr arithmetic, logic, test and unpack forms.
1079def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
1080  let Latency = 1;
1081  let ReleaseAtCycles = [1];
1082  let NumMicroOps = 1;
1083}
1084def : InstRW<[Zn4WriteVecOpMask], (instrs   KADDBrr, KADDDrr, KADDQrr, KADDWrr,
1085                                            KANDBrr, KANDDrr, KANDQrr, KANDWrr,
1086                                            KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
1087                                            KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
1088                                            KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
1089                                            KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
1090                                            KORBrr, KORDrr, KORQrr, KORWrr,
1091                                            KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
1092                                            KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
1093                                            KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
1094                                            KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
1095                                            KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;
1096
// KMOV*mk forms: one micro-op with Latency = 1 on Zn4FPOpMask4. Only that
// single resource is listed; no AGU, load or store unit appears here.
1097def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
1098  let Latency = 1;
1099  let ReleaseAtCycles = [1];
1100  let NumMicroOps = 1;
1101}
1102def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
1103
// KMOV*kr forms: field-for-field identical to Zn4WriteVecOpMaskMemMov
// (Zn4FPOpMask4, Latency = 1, one micro-op), kept as a separate record.
1104def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
1105  let Latency = 1;
1106  let ReleaseAtCycles = [1];
1107  let NumMicroOps = 1;
1108}
1109def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
1110
// Override for the register-immediate VALIGND / VALIGNQ forms (128-, 256-
// and 512-bit): one micro-op on Zn4FPVAdd12 with Latency = 4. The TODO below
// records that the 4-cycle latency is an expectation for all align forms.
1111def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1112  // TODO: All align instructions are expected to be of 4 cycle latency
1113  let Latency = 4;
1114  let ReleaseAtCycles = [1];
1115  let NumMicroOps = 1;
1116}
1117def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1118                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1119                                            >;
// YMM vector integer ALU class, followed by an override for the YMM
// register-register abs / saturating add-sub / average / sign instructions
// and VPCMPEQQYrr. The override narrows the resource to Zn4FPVAdd01.
// NOTE(review): despite the "Slow" suffix this record keeps Latency = 1 and
// ReleaseAtCycles = [1], whereas the XMM counterpart Zn4WriteVecALUXSlow
// uses 2 and [2] - confirm the values are intended.
1120defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1121
1122def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1123  let Latency = 1;
1124  let ReleaseAtCycles = [1];
1125  let NumMicroOps = 1;
1126}
1127def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1128                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1129                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1130                                            VPAVGBYrr, VPAVGWYrr,
1131                                            VPCMPEQQYrr,
1132                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1133
// Vector integer classes: ALU (ZMM), logicals, TEST, shifts, multiplies,
// shuffles, blends, PSADBW, MPSAD and PHMINPOS. Each ZMM entry passes twice
// the bracketed cycle count(s) of its YMM sibling.
// NOTE(review): WriteVecShiftX passes 2/[2] while WriteVecShift and
// WriteVecShiftY pass 1/[1] - confirm the XMM entry is meant to differ.
1134defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
1135
1136defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
1137defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1138defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1139defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
1140defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1141defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
1142defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (ZMM).
1143defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
1144defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
1145defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1146defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
1147defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
1148defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1149defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1150defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
1151defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
1152defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1153defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1154defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
1155defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
1156defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1157defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
1158defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector shuffles.
1159defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1160defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1161defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
1162defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
1163defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1164defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1165defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
1166defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
1167defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1168defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
1169defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
1170defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1171defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
1172defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
1173defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1174defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1175defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
1176defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1177defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1178defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
1179defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1180
1181// Vector insert/extract operations.
// Insert and extract both use Zn4FPLd01 with 1, [2], 2. WriteVecInsert
// passes LoadUOps=-1; the extract-and-store class uses the FP store path
// with StoreLatency + 1.
1182defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1183defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1184defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1185
1186// MOVMSK operations.
// All four classes share identical parameters and are bound to Zn4FPVMisc2.
1187defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1188defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1189defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
1190defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1191
1192// Conversion between integer and float.
// Double -> integer classes, the stand-alone record Zn4WriteCvtPD2IMMX, and
// the scalar float -> integer class. All use Zn4FPFCvt01.
// NOTE(review): no InstRW in the visible part of the file references
// Zn4WriteCvtPD2IMMX - confirm it is bound elsewhere or is unused.
1193defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>;  // Double -> Integer.
1194defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
1195defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
1196defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).
1197
1198def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1199  let Latency = 1;
1200  let ReleaseAtCycles = [2];
1201  let NumMicroOps = 2;
1202}
1203defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>;  // Float -> Integer.
1204
1205defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1206defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
1207defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
1208
// Integer -> Double (scalar, XMM, YMM, ZMM).
defm : Zn4WriteResXMMPair<WriteCvtI2SD,  [Zn4FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResXMMPair<WriteCvtI2PD,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4,
                          /*LoadUOps=*/-1>;

// Standalone write resource for the MMX flavour of Integer -> Double.
def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [6];
}

// Integer -> Float (scalar, XMM, YMM, ZMM).
defm : Zn4WriteResXMMPair<WriteCvtI2SS,  [Zn4FPFCvt01], 3, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResXMMPair<WriteCvtI2PS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>;

// Standalone write resource for the MMX flavour of Integer -> Float.
def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let NumMicroOps = 2;
  let Latency = 3;
  let ReleaseAtCycles = [1];
}
1230
// Float -> Double size conversion (scalar, XMM, YMM, ZMM).
defm : Zn4WriteResXMMPair<WriteCvtSS2SD,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteCvtPS2PD,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4,
                          /*LoadUOps=*/-1>;

// Double -> Float size conversion (scalar, XMM, YMM, ZMM).
defm : Zn4WriteResXMMPair<WriteCvtSD2SS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteCvtPD2PS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>;
defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>;

// Half -> Float size conversion (XMM, YMM, ZMM).
defm : Zn4WriteResXMMPair<WriteCvtPH2PS,  [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4,
                          /*LoadUOps=*/-1>;

// Float -> Half size conversion (XMM, YMM, ZMM); non-pair helpers are used.
defm : Zn4WriteResXMM<WriteCvtPS2PH,  [Zn4FPFCvt01], 3, [2], 1>;
defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>;
defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>;
1248
// Float -> Half size conversion fused with a store (XMM, YMM, ZMM).
// Each latency is a fixed cycle count added to the model's StoreLatency.
defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store],
                      !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>;
defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store],
                      !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>;
// The ZMM write class is instantiated through the ZMM helper, consistent with
// the WriteCvtPS2PHZ entry (it previously went through the YMM helper).
defm : Zn4WriteResZMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store],
                      !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>;
1252
// CRC32 instruction: issued on Zn4ALU1 only.
defm : Zn4WriteResIntPair<WriteCRC32,
                          [Zn4ALU1],
                          3, [1], 1>;
1255
// SHA1MSG1, register source.
def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteSHA1MSG1rr],
             (instrs SHA1MSG1rr)>;

// SHA1MSG1, memory source: the model's LoadLatency is added to the register
// form's latency, and the micro-op count is taken from the register form.
def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteSHA1MSG1rm],
             (instrs SHA1MSG1rm)>;
1269
// SHA1MSG2 / SHA1NEXTE, register source.
def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr],
             (instrs SHA1MSG2rr, SHA1NEXTErr)>;

// SHA1MSG2 / SHA1NEXTE, memory source: register-form latency plus the model's
// LoadLatency; micro-op count is taken from the register form.
def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm],
             (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1283
// SHA256MSG1, register source.
def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [3];
}
def : InstRW<[Zn4WriteSHA256MSG1rr],
             (instrs SHA256MSG1rr)>;

// SHA256MSG1, memory source: register-form latency plus the model's
// LoadLatency; micro-op count is taken from the register form.
def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 3];
}
def : InstRW<[Zn4Writerm_SHA256MSG1rm],
             (instrs SHA256MSG1rm)>;
1297
// SHA256MSG2, register source.
def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 4;
  let Latency = 3;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteSHA256MSG2rr],
             (instrs SHA256MSG2rr)>;

// SHA256MSG2, memory source: register-form latency plus the model's
// LoadLatency, and one micro-op more than the register form.
def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteSHA256MSG2rm],
             (instrs SHA256MSG2rm)>;
1311
// SHA1RNDS4, register/immediate form.
def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 6;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteSHA1RNDS4rri],
             (instrs SHA1RNDS4rri)>;

// SHA256RNDS2, register form.
def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteSHA256RNDS2rr],
             (instrs SHA256RNDS2rr)>;
1325
// Strings instructions.

// Packed Compare Implicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3,
                          /*LoadUOps=*/1>;

// Packed Compare Explicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7,
                          /*LoadUOps=*/5>;

// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;

// Packed Compare Explicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8,
                          /*LoadUOps=*/4>;
1335
// AES instructions: all three classes share the same Zn4FPAES01 entry.
defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn4WriteResXMMPair<WriteAESIMC,    [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn4WriteResXMMPair<WriteCLMul,
                          [Zn4FPCLM01],
                          4, [4], 4>;
1343
// EMMS/FEMMS
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>;

// Load/store MXCSR
// FIXME: latency not from llvm-exegesis (applies to both entries below)
defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123],
                      !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>;
defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store],
                      !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
1353
// VZEROUPPER: modeled with zero latency and a single micro-op.
def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVZEROUPPER],
             (instrs VZEROUPPER)>;

// VZEROALL: modeled as an 18 micro-op sequence.
def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 18;
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
}
def : InstRW<[Zn4WriteVZEROALL],
             (instrs VZEROALL)>;
1367
// AVX2.

// Fp 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1,
                          /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2,
                          /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>;
1372
// VPERM2I128 / VPERM2F128, register source.
def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr, VPERM2F128rr)>;

// VPERM2F128, memory source: register-form latency plus the model's
// LoadLatency; micro-op count is taken from the register form.
def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
}
def : InstRW<[Zn4WriteVPERM2F128rm],
             (instrs VPERM2F128rm)>;
1386
// VPERMPS (YMM), register source.
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 2;
  let Latency = 7;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVPERMPSYrr],
             (instrs VPERMPSYrr)>;

// VPERMPS (YMM), memory source: register-form latency plus the model's
// LoadLatency, and one micro-op more than the register form.
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteVPERMPSYrm],
             (instrs VPERMPSYrm)>;
1400
// VPERMPD / VPERMQ (YMM), register source with immediate.
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 2;
  let Latency = 6;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVPERMYri],
             (instrs VPERMPDYri, VPERMQYri)>;

// VPERMPD (YMM), memory source with immediate: register-form latency plus the
// model's LoadLatency, and one micro-op more than the register form.
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteVPERMPDYmi],
             (instrs VPERMPDYmi)>;
1414
// VPERMD (YMM), register source.
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let NumMicroOps = 2;
  let Latency = 5;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVPERMDYrr],
             (instrs VPERMDYrr)>;

// VPERMQ (immediate) / VPERMD, memory source: derived from the VPERMD
// register form, adding the model's LoadLatency and no extra micro-ops.
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteVPERMYm],
             (instrs VPERMQYmi, VPERMDYrm)>;
1428
// 256-bit width packed vector width-changing move.
defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2,
                          /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>;

// Variable vector shifts (XMM, YMM, ZMM).
defm : Zn4WriteResXMMPair<WriteVarVecShift,  [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>;
1434
// Old microcoded instructions that nobody uses.
defm : Zn4WriteResInt<WriteMicrocoded,
                      [Zn4ALU0123],
                      100, [100], 100>;

// Fence instructions.
defm : Zn4WriteResInt<WriteFence,
                      [Zn4ALU0123],
                      1, [100], 1>;
1440
// LFENCE: one micro-op that holds Zn4LSU for 30 cycles.
def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [30];
}
def : InstRW<[Zn4WriteLFENCE],
             (instrs LFENCE)>;

// SFENCE: one micro-op that holds Zn4LSU for a single cycle.
def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteSFENCE],
             (instrs SFENCE)>;
1454
// Nop, not very useful except that it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteNop,
                      [Zn4ALU0123],
                      0, [1], 1>;
1457
1458
1459///////////////////////////////////////////////////////////////////////////////
1460// Zero Cycle Move
1461///////////////////////////////////////////////////////////////////////////////
1462
// Write that occupies no processor resource (empty resource list), has zero
// latency and is modeled as a single micro-op.
def Zn4WriteZeroLatency : SchedWriteRes<[]> {
  let NumMicroOps = 1;
  let Latency = 0;
  let ReleaseAtCycles = [];
}
// GPR register-to-register moves scheduled with the zero-latency write.
def : InstRW<[Zn4WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;
1471
// Register exchange write: no processor resource, zero latency, two
// micro-ops.
def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
  let NumMicroOps = 2;
  let Latency = 0;
  let ReleaseAtCycles = [];
}
def : InstRW<[Zn4WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;
1479
// Compare+Exchange - TODO RMW support.
defm : Zn4WriteResInt<WriteXCHG,
                      [Zn4ALU0123],
                      0, [8], 2>;
1481
// FP register moves (XMM, YMM, ZMM): no processor resource, zero latency,
// one micro-op. Each width is instantiated through its matching helper; the
// ZMM class previously went through the YMM helper.
defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : Zn4WriteResZMM<WriteFMoveZ, [], 0, [], 1>;

// Vector integer register moves. The MMX class is the only one that occupies
// an execution resource; the XMM, YMM and ZMM classes have an empty resource
// list and zero latency.
defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : Zn4WriteResZMM<WriteVecMoveZ, [], 0, [], 1>;
1490
// Register moves flagged as optimizable for this model. All opcodes form a
// single equivalence class guarded by TruePred.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV, MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar, XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV, MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV, MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV, MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV, VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV, VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV, VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV, VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV, VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV, VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;
1528
// FIXUP and RANGE Instructions
def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
// Opcodes are selected by name pattern, one pattern per line.
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
        "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik",
        "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
        "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri",
        "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
        "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k",
        "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
        )>;
1540
// SCALE & REDUCE instructions
def Zn4WriteSCALErr : SchedWriteRes<[Zn4FPFMisc23]> {
  let NumMicroOps = 2;
  let Latency = 6;
  let ReleaseAtCycles = [6];
}
def : InstRW<[Zn4WriteSCALErr],
             (instregex
               "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
               "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
             )>;
1551
// BF16PS Instructions
def Zn4WriteBF16 : SchedWriteRes<[Zn4FPFMisc23]> {
  let NumMicroOps = 2;
  let Latency = 6;
  let ReleaseAtCycles = [6];
}
def : InstRW<[Zn4WriteBF16],
             (instregex
               "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
             )>;
1561
// BUSD and VPMADD Instructions
def Zn4WriteBUSDr_VPMADDr : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr],
             (instregex
               "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
               "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
             )>;
1572
// SHIFT instructions
def Zn4WriteSHIFTrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
// NOTE(review): the last entry, VFMSUB231SSZr_Intkz, is an FMA opcode name in
// a list of shift/rotate/shuffle patterns - confirm it is intentional.
def : InstRW<[Zn4WriteSHIFTrr],
             (instregex
               "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
               "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
               "(V?)P(SLL|SRL|SRA)DQYri",
               "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
               "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
               "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
               "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
               "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
               "VPSHUFBITQMBZ128rr",
               "VFMSUB231SSZr_Intkz"
             )>;
1590
// Vector shifts by immediate (EVEX Z/Z128/Z256 name patterns).
def Zn4WriteSHIFTri : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteSHIFTri],
             (instregex
               "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
             )>;
1599
// ALIGN Instructions
def Zn4WriteALIGN : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteALIGN],
             (instregex
               "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
             )>;
1609
// PACK Instructions
def Zn4WritePACK : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WritePACK],
             (instregex
               "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
             )>;
1619
// MAX and MIN Instructions (the first pattern also picks up CMP forms).
def Zn4WriteFCmp64 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteFCmp64],
             (instregex
               "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
               "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
               "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
               "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
             )>;
1632
// MOV Instructions
def Zn4MOVDUPZ : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4MOVDUPZ],
             (instregex
               "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)"
             )>;
1642
// PMOV sign/zero-extending and truncating moves (Y, Z128 and Z256 patterns).
def Zn4MOVS : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4MOVS],
             (instregex
               "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
               "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
               "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
               "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?|Z256?)(rr|rrk|rrkz)"
             )>;
1654
// PMOVSX / PMOVZX with the optional Z suffix pattern.
def Zn4MOVSZ : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4MOVSZ],
             (instregex
               "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
             )>;

// Truncating PMOV forms with the optional Z suffix pattern.
def Zn4MOVSrr : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 5;
  let ReleaseAtCycles = [5];
}
def : InstRW<[Zn4MOVSrr],
             (instregex
               "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
             )>;
1672
1673
// VPTEST Instructions
// Three write resources on Zn4FPFMisc01; latency and resource cycles are
// 3, 4 and 5 for the Z128, Z256 and Z patterns respectively.
def Zn4VPTESTZ128 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [3];
}
def : InstRW<[Zn4VPTESTZ128],
             (instregex
               "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
             )>;

def Zn4VPTESTZ256 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [4];
}
def : InstRW<[Zn4VPTESTZ256],
             (instregex
               "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
             )>;

def Zn4VPTESTZ : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 5;
  let ReleaseAtCycles = [5];
}
def : InstRW<[Zn4VPTESTZ],
             (instregex
               "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
             )>;
1701
// CONFLICT Instructions
def Zn4CONFLICTZ128 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4CONFLICTZ128],
             (instregex
               "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
             )>;

// Z and Z256 forms: four micro-ops spread over three resource groups, each
// held for two cycles.
def Zn4CONFLICTrr : SchedWriteRes<[Zn4FPFMisc01, Zn4FPFMisc12, Zn4FPFMisc23]> {
  let NumMicroOps = 4;
  let Latency = 6;
  let ReleaseAtCycles = [2, 2, 2];
}
def : InstRW<[Zn4CONFLICTrr],
             (instregex
               "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
             )>;
1720
// RSQRT Instructions
def Zn4VRSQRT14PDZ256 : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 5;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4VRSQRT14PDZ256],
             (instregex
               "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
             )>;
1730
1731
// PERM Instructions
def Zn4PERMILP : SchedWriteRes<[Zn4FPFMisc123]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4PERMILP],
             (instregex
               "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
             )>;
1741
// Two-source permutes (VPERMI2 / VPERMT2), Z128 forms.
def Zn4PERMIT2_128 : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4PERMIT2_128],
             (instregex
               "VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)",
               "VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)"
             )>;

// COMPRESS and single-source VPERM, Z128 forms.
def Zn4PERMIT2_128rr : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4PERMIT2_128rr],
             (instregex
               "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
               "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
             )>;
1761
// Permute, compress and expand, Z256 forms.
def Zn4PERMIT2_256 : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4PERMIT2_256],
             (instregex
               "VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)",
               "VPERMP(S|D)Z256(rr|rrk|rrkz)",
               "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
               "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
               "VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)",
               "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
             )>;
1775
// Permute, compress and expand, Z forms.
def Zn4PERMIT2Z : SchedWriteRes<[Zn4FPFMisc12]> {
  let NumMicroOps = 1;
  let Latency = 5;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4PERMIT2Z],
             (instregex
               "VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)",
               "VPERM(B|D|W)Z(rr|rrk|rrkz)",
               "VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)",
               "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
               "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
               "VPERMP(S|D)Z(rr|rrk|rrkz)"
             )>;
1789
// ALU SLOW Misc Instructions
def Zn4VecALUZSlow : SchedWriteRes<[Zn4FPFMisc01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
// One row per mnemonic: the Z128 rr, rrk and rrkz forms.
def : InstRW<[Zn4VecALUZSlow], (instrs
        VPABSBZ128rr,    VPABSBZ128rrk,    VPABSBZ128rrkz,
        VPABSDZ128rr,    VPABSDZ128rrk,    VPABSDZ128rrkz,
        VPABSQZ128rr,    VPABSQZ128rrk,    VPABSQZ128rrkz,
        VPABSWZ128rr,    VPABSWZ128rrk,    VPABSWZ128rrkz,
        VPADDSBZ128rr,   VPADDSBZ128rrk,   VPADDSBZ128rrkz,
        VPADDSWZ128rr,   VPADDSWZ128rrk,   VPADDSWZ128rrkz,
        VPADDUSBZ128rr,  VPADDUSBZ128rrk,  VPADDUSBZ128rrkz,
        VPADDUSWZ128rr,  VPADDUSWZ128rrk,  VPADDUSWZ128rrkz,
        VPAVGBZ128rr,    VPAVGBZ128rrk,    VPAVGBZ128rrkz,
        VPAVGWZ128rr,    VPAVGWZ128rrk,    VPAVGWZ128rrkz,
        VPOPCNTBZ128rr,  VPOPCNTBZ128rrk,  VPOPCNTBZ128rrkz,
        VPOPCNTDZ128rr,  VPOPCNTDZ128rrk,  VPOPCNTDZ128rrkz,
        VPOPCNTQZ128rr,  VPOPCNTQZ128rrk,  VPOPCNTQZ128rrkz,
        VPOPCNTWZ128rr,  VPOPCNTWZ128rrk,  VPOPCNTWZ128rrkz,
        VPSUBSBZ128rr,   VPSUBSBZ128rrk,   VPSUBSBZ128rrkz,
        VPSUBSWZ128rr,   VPSUBSWZ128rrk,   VPSUBSWZ128rrkz,
        VPSUBUSBZ128rr,  VPSUBUSBZ128rrk,  VPSUBUSBZ128rrkz,
        VPSUBUSWZ128rr,  VPSUBUSWZ128rrk,  VPSUBUSWZ128rrkz
        )>;
1812
1813
1814///////////////////////////////////////////////////////////////////////////////
1815// Dependency breaking instructions.
1816///////////////////////////////////////////////////////////////////////////////
1817
// GPR zero idioms: resolves to Zn4WriteZeroLatency when ZeroIdiomPredicate
// holds, otherwise to the regular WriteALU.
def Zn4WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV)>;
1826
// Register-register CMP: resolves to Zn4WriteZeroLatency when operands 0 and
// 1 are the same register, otherwise to the regular WriteALU.
def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV)>;
1835
// FP logic zero idioms (XMM): zero-latency when ZeroIdiomPredicate holds,
// otherwise WriteFLogic.
def Zn4WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn4WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;

// FP logic zero idioms (YMM): zero-latency when ZeroIdiomPredicate holds,
// otherwise WriteFLogicY.
def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn4WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;
1850
// Vector integer logic zero idioms (XMM): zero-latency when
// ZeroIdiomPredicate holds, otherwise WriteVecLogicX.
def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomLogicX],
             (instrs VPXORrr, VPANDNrr)>;

// Vector integer logic zero idioms (YMM): zero-latency when
// ZeroIdiomPredicate holds, otherwise WriteVecLogicY.
def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomLogicY],
             (instrs VPXORYrr, VPANDNYrr)>;
1863
// Vector integer ALU zero idioms (XMM): zero-latency when ZeroIdiomPredicate
// holds, otherwise WriteVecALUX.
def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomALUX], (instrs
        VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
        VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// Vector integer ALU zero idioms (YMM): zero-latency when ZeroIdiomPredicate
// holds, otherwise WriteVecALUY.
def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomALUY], (instrs
        VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
        VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1881
// Opcodes recognised as zero idioms; every class is guarded by
// ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV, XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV, SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr, VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;
1934
// Opcodes recognised as dependency breaking. The CMP class is guarded by
// CheckSameRegOperand<0, 1>; all other classes use ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV, SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,  CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV, CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;
1958
1959} // SchedModel
1960
1961