1//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver4 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//===----------------------------------------------------------------------===//
15
16def Znver4Model : SchedMachineModel {
17  // AMD SOG 19h, 2.9.6 Dispatch
18  // The processor may dispatch up to 6 macro ops per cycle
19  // into the execution engine.
20  let IssueWidth = 6;
21  // AMD SOG 19h, 2.10.3
22  // The retire control unit (RCU) tracks the completion status of all
23  // outstanding operations (integer, load/store, and floating-point) and is
24  // the final arbiter for exception processing and recovery.
25  // The unit can receive up to 6 macro ops dispatched per cycle and track up
26  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
27  let MicroOpBufferSize = 320;
28  // AMD SOG 19h, 2.9.1 Op Cache
29  // The op cache is organized as an associative cache with 64 sets and 8 ways.
30  // At each set-way intersection is an entry containing up to 8 macro ops.
31  // The maximum capacity of the op cache is 4K ops.
32  // Agner, 22.5 µop cache
33  // The size of the µop cache is big enough for holding most critical loops.
34  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
35  //        with large values here the compilation of certain loops
36  //        ends up taking way too long.
37  // Ideally for znver4, we should have 6.75K. However, we don't use that value
38  // because of the compile-time impact, and prefer the default value
39  // instead.
40  // let LoopMicroOpBufferSize = 6750;
41  // AMD SOG 19h, 2.6.2 L1 Data Cache
42  // The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
43  // AMD SOG 19h, 2.12 L1 Data Cache
44  // The AGU and LS pipelines are optimized for simple address generation modes.
45  // <...> and can achieve 4-cycle load-to-use integer load latency.
46  let LoadLatency = 4;
47  // AMD SOG 19h, 2.12 L1 Data Cache
48  // The AGU and LS pipelines are optimized for simple address generation modes.
49  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
50  int VecLoadLatency = 7;
51  // Latency of a simple store operation.
52  int StoreLatency = 1;
53  // FIXME:
54  let HighLatency = 25; // FIXME: any better choice?
55  // AMD SOG 19h, 2.8 Optimizing Branching
56  // The branch misprediction penalty is in the range from 11 to 18 cycles,
57  // <...>. The common case penalty is 13 cycles.
58  let MispredictPenalty = 13;
59
60  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
61
62  let CompleteModel = 1;
63}
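// Note (illustrative): LoadLatency, VecLoadLatency and StoreLatency above are
// consumed by the Zn4WriteRes* helpers and per-instruction entries below,
// e.g. !add(Znver4Model.LoadLatency, 1) for a simple integer load.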
64
65let SchedModel = Znver4Model in {
66
67
68//===----------------------------------------------------------------------===//
69// RCU
70//===----------------------------------------------------------------------===//
71
72// AMD SOG 19h, 2.10.3 Retire Control Unit
73// The unit can receive up to 6 macro ops dispatched per cycle and track up to
74// 320 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
75// The retire unit handles in-order commit of up to nine macro ops per cycle.
76def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
77
78//===----------------------------------------------------------------------===//
79// Integer Execution Unit
80//
81
82// AMD SOG 19h, 2.4 Superscalar Organization
83// The processor uses four decoupled independent integer scheduler queues,
84// each one servicing one ALU pipeline and one or two other pipelines
85
86//
87// Execution pipes
88//===----------------------------------------------------------------------===//
89
90// AMD SOG 19h, 2.10.2 Execution Units
91// The processor contains 4 general purpose integer execution pipes.
92// Each pipe has an ALU capable of general purpose integer operations.
93def Zn4ALU0 : ProcResource<1>;
94def Zn4ALU1 : ProcResource<1>;
95def Zn4ALU2 : ProcResource<1>;
96def Zn4ALU3 : ProcResource<1>;
97
98// AMD SOG 19h, 2.10.2 Execution Units
99// There is also a separate branch execution unit.
100def Zn4BRU1 : ProcResource<1>;
101
102// AMD SOG 19h, 2.10.2 Execution Units
103// There are three Address Generation Units (AGUs) for all load and store
104// address generation. There are also 3 store data movement units
105// associated with the same schedulers as the AGUs.
106def Zn4AGU0 : ProcResource<1>;
107def Zn4AGU1 : ProcResource<1>;
108def Zn4AGU2 : ProcResource<1>;
109
110//
111// Execution Units
112//===----------------------------------------------------------------------===//
113
114// AMD SOG 19h, 2.10.2 Execution Units
115// ALU0 additionally has divide <...> execution capability.
116defvar Zn4Divider = Zn4ALU0;
117
118// AMD SOG 19h, 2.10.2 Execution Units
119// ALU0 additionally has <...> branch execution capability.
120defvar Zn4BRU0 = Zn4ALU0;
121
122// Integer Multiplication issued on ALU1.
123defvar Zn4Multiplier = Zn4ALU1;
124
125// Execution pipeline grouping
126//===----------------------------------------------------------------------===//
127
128// General ALU operations
129def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;
130
131// General AGU operations
132def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;
133
134// Control flow: jumps, calls
135def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;
136
137// Everything that isn't control flow, but still needs to access CC register,
138// namely: conditional moves, SETcc.
139def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;
140
141// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
142
143// Simple bit twiddling: bit test, shift/rotate, bit extraction
144def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
145
146
147//
148// Scheduling
149//===----------------------------------------------------------------------===//
150
151// AMD SOG 19h, 2.10.3 Retire Control Unit
152// The integer physical register file (PRF) consists of 224 registers.
153def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
154                              6,  // Max moves that can be eliminated per cycle.
155                              0>; // Restrict move elimination to zero regs.
156
157// AnandTech: The integer scheduler has a 4*24 entry macro op capacity.
158// AMD SOG 19h, 2.10.1 Schedulers
159// The schedulers can receive up to six macro ops per cycle, with a limit of
160// two per scheduler. Each scheduler can issue one micro op per cycle into
161// each of its associated pipelines
162def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
163                           Zn4ALU1, Zn4AGU1,          // scheduler 1
164                           Zn4ALU2, Zn4AGU2,          // scheduler 2
165                           Zn4ALU3,          Zn4BRU1  // scheduler 3
166                          ]> {
167  let BufferSize = !mul(4, 24);
168}
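// Note (illustrative): BufferSize above evaluates to !mul(4, 24) == 96, i.e.
// the four 24-entry ALU/AGU scheduler queues pooled into one group; this is an
// approximation, since the queues are actually decoupled (see 2.4 above).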
169
170
171//===----------------------------------------------------------------------===//
172// Floating-Point Unit
173//
174
175// AMD SOG 19h, 2.4 Superscalar Organization
176// The processor uses <...> two decoupled independent floating point schedulers
177// each servicing two FP pipelines and one store or FP-to-integer pipeline.
178
179//
180// Execution pipes
181//===----------------------------------------------------------------------===//
182
183// AMD SOG 19h, 2.10.1 Schedulers
184// <...>, and six FPU pipes.
185// Agner, 22.10 Floating point execution pipes
186// There are six floating point/vector execution pipes,
187def Zn4FP0  : ProcResource<1>;
188def Zn4FP1  : ProcResource<1>;
189def Zn4FP2  : ProcResource<1>;
190def Zn4FP3  : ProcResource<1>;
191def Zn4FP45 : ProcResource<2>;
192
193//
194// Execution Units
195//===----------------------------------------------------------------------===//
196// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
197
198// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
199defvar Zn4FPFMul0 = Zn4FP0;
200defvar Zn4FPFMul1 = Zn4FP1;
201
202// (v)FADD*
203defvar Zn4FPFAdd0 = Zn4FP2;
204defvar Zn4FPFAdd1 = Zn4FP3;
205
206// All convert operations except pack/unpack
207defvar Zn4FPFCvt0 = Zn4FP2;
208defvar Zn4FPFCvt1 = Zn4FP3;
209
210// All Divide and Square Root except Reciprocal Approximation
211// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
212// FDIV unit can support 2 simultaneous operations in flight
213// even though it occupies a single pipe.
214// FIXME: BufferSize=2 ?
215defvar Zn4FPFDiv = Zn4FP1;
216
217// Moves and Logical operations on Floating Point Data Types
218defvar Zn4FPFMisc0 = Zn4FP0;
219defvar Zn4FPFMisc1 = Zn4FP1;
220defvar Zn4FPFMisc2 = Zn4FP2;
221defvar Zn4FPFMisc3 = Zn4FP3;
222
223// Integer Adds, Subtracts, and Compares
224// Some complex VADD operations are not available in all pipes.
225defvar Zn4FPVAdd0 = Zn4FP0;
226defvar Zn4FPVAdd1 = Zn4FP1;
227defvar Zn4FPVAdd2 = Zn4FP2;
228defvar Zn4FPVAdd3 = Zn4FP3;
229
230// Integer Multiplies, SAD, Blendvb
231defvar Zn4FPVMul0 = Zn4FP0;
232defvar Zn4FPVMul1 = Zn4FP3;
233
234// Data Shuffles, Packs, Unpacks, Permute
235// Some complex shuffle operations are only available in pipe1.
236defvar Zn4FPVShuf = Zn4FP1;
237defvar Zn4FPVShufAux = Zn4FP2;
238
239// Bit Shift Left/Right operations
240defvar Zn4FPVShift0 = Zn4FP1;
241defvar Zn4FPVShift1 = Zn4FP2;
242
243// Moves and Logical operations on Packed Integer Data Types
244defvar Zn4FPVMisc0 = Zn4FP0;
245defvar Zn4FPVMisc1 = Zn4FP1;
246defvar Zn4FPVMisc2 = Zn4FP2;
247defvar Zn4FPVMisc3 = Zn4FP3;
248
249// *AES*
250defvar Zn4FPAES0 = Zn4FP0;
251defvar Zn4FPAES1 = Zn4FP1;
252
253// *CLM*
254defvar Zn4FPCLM0 = Zn4FP0;
255defvar Zn4FPCLM1 = Zn4FP1;
256
257// Execution pipeline grouping
258//===----------------------------------------------------------------------===//
259
260// AMD SOG 19h, 2.11 Floating-Point Unit
261// Stores and floating point to general purpose register transfer
262// have 2 dedicated pipelines (pipe 5 and 6).
263def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
264
265// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
266def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;
267
268// (v)FADD*
269// Some complex VADD operations are not available in all pipes.
270def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;
271
272// All convert operations except pack/unpack
273def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;
274
275// All Divide and Square Root except Reciprocal Approximation
276// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;
277
278// Moves and Logical operations on Floating Point Data Types
279def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
280
281// FIXUP and RANGE use FP01 pipelines
282def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
283def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
284// SCALE instructions use FP23 pipelines
285def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
286def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
287
288// Loads, Stores and Move to General Register (EX) Operations
289// AMD SOG 19h, 2.11 Floating-Point Unit
290// Stores and floating point to general purpose register transfer
291// have 2 dedicated pipelines (pipe 5 and 6).
292defvar Zn4FPLd01 = Zn4FP45;
293
294// AMD SOG 19h, 2.11 Floating-Point Unit
295// Note that FP stores are supported on two pipelines,
296// but throughput is limited to one per cycle.
297let Super = Zn4FP45 in
298def Zn4FPSt : ProcResource<1>;
299
300// Integer Adds, Subtracts, and Compares
301// Some complex VADD operations are not available in all pipes.
302def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;
303
304def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
305def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;
306
307// AVX512 Opmask pipelines
308def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
309def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;
310
311// Integer Multiplies, SAD, Blendvb
312def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;
313
314// Data Shuffles, Packs, Unpacks, Permute
315// Some complex shuffle operations are only available in pipe1.
316def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;
317
318// Bit Shift Left/Right operations
319def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;
320
321// Moves and Logical operations on Packed Integer Data Types
322def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;
323
324// *AES*
325def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;
326
327// *CLM*
328def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
329
330
331//
332// Scheduling
333//===----------------------------------------------------------------------===//
334
335// Agner, 21.8 Register renaming and out-of-order schedulers
336// The floating point register file has 192 vector registers
337// of 512 bits each in Zen 4.
338def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
339                            6,  // Max moves that can be eliminated per cycle.
340                            0>; // Restrict move elimination to zero regs.
341
342// AMD SOG 19h, 2.11 Floating-Point Unit
343// The floating-point scheduler has a 2*32 entry macro op capacity.
344// AMD SOG 19h, 2.11 Floating-Point Unit
345// <...> the scheduler can issue 1 micro op per cycle for each pipe.
346// FIXME: those are two separate schedulers, not a single big one.
347def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2,          /*Zn4FP4,*/ // scheduler 0
348                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/  // scheduler 1
349                         ]> {
350  let BufferSize = !mul(2, 32);
351}
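// Note (illustrative): BufferSize above evaluates to !mul(2, 32) == 64, i.e.
// the two 32-entry FP schedulers pooled into a single group, mirroring the
// approximation used for Zn4Int on the integer side.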
352
353// AMD SOG 19h, 2.11 Floating-Point Unit
354// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
355// even if the floating-point scheduler is full.
356// FIXME: how to model this properly?
357
358
359//===----------------------------------------------------------------------===//
360// Load-Store Unit
361//
362
363// AMD SOG 19h, 2.12 Load-Store Unit
364// The LS unit contains three largely independent pipelines
365// enabling the execution of three 256-bit memory operations per cycle.
366def Zn4LSU : ProcResource<3>;
367
368// AMD SOG 19h, 2.12 Load-Store Unit
369// All three memory operations can be loads.
370let Super = Zn4LSU in
371def Zn4Load : ProcResource<3> {
372  // AMD SOG 19h, 2.12 Load-Store Unit
373  // The LS unit can process up to 72 out-of-order loads.
374  let BufferSize = 72;
375}
376
377def Zn4LoadQueue : LoadQueue<Zn4Load>;
378
379// AMD SOG 19h, 2.12 Load-Store Unit
380// A maximum of two of the memory operations can be stores.
381let Super = Zn4LSU in
382def Zn4Store : ProcResource<2> {
383  // AMD SOG 19h, 2.12 Load-Store Unit
384  // The LS unit utilizes a 64-entry store queue (STQ).
385  let BufferSize = 64;
386}
387
388def Zn4StoreQueue : StoreQueue<Zn4Store>;
389
390//===----------------------------------------------------------------------===//
391// Basic helper classes.
392//===----------------------------------------------------------------------===//
393
394// Many SchedWrites are defined in pairs with and without a folded load.
395// Instructions with folded loads are usually micro-fused, so they only appear
396// as two micro-ops when dispatched by the schedulers.
397// This multiclass defines the resource usage for variants with and without
398// folded loads.
399
400multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
401                         int Lat = 1, list<int> Res = [], int UOps = 1> {
402  def : WriteRes<SchedRW, ExePorts> {
403    let Latency = Lat;
404    let ResourceCycles = Res;
405    let NumMicroOps = UOps;
406  }
407}
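// For illustration (not part of the model): an instantiation such as
//   defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>;
// (see further down) forwards here and simply emits one anonymous
// WriteRes<WriteLAHFSAHF, [Zn4ALU3]> record with Latency = 1,
// ResourceCycles = [1] and NumMicroOps = 1.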
408
409multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
410                             list<ProcResourceKind> ExePorts, int Lat,
411                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
412                             ProcResourceKind AGU, int LoadRes> {
413  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
414
415  defm : __Zn4WriteRes<SchedRW.Folded,
416                       !listconcat([AGU, Zn4Load], ExePorts),
417                       !add(Lat, LoadLat),
418                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
419                         [],
420                         !listconcat([1, LoadRes],
421                           !if(!empty(Res),
422                             !listsplat(1, !size(ExePorts)),
423                             Res))),
424                       !add(UOps, LoadUOps)>;
425}
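// For illustration (not part of the model): when instantiated further down as
//   defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>;
// the register form gets Latency = 1, ResourceCycles = [4], NumMicroOps = 1,
// while the folded-load form (WriteADC.Folded) uses
// [Zn4AGU012, Zn4Load, Zn4ALU0123] with Latency = 1 + Znver4Model.LoadLatency = 5,
// ResourceCycles = !listconcat([1, LoadRes], Res) = [1, 1, 4] and
// NumMicroOps = 1. Only when Res is empty and LoadRes is 1 is ResourceCycles
// left empty, so each listed resource defaults to a single cycle.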
426
427// For classes without folded loads.
428multiclass Zn4WriteResInt<SchedWrite SchedRW,
429                          list<ProcResourceKind> ExePorts, int Lat = 1,
430                          list<int> Res = [], int UOps = 1> {
431  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
432}
433
434multiclass Zn4WriteResXMM<SchedWrite SchedRW,
435                          list<ProcResourceKind> ExePorts, int Lat = 1,
436                          list<int> Res = [], int UOps = 1> {
437  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
438}
439
440multiclass Zn4WriteResYMM<SchedWrite SchedRW,
441                          list<ProcResourceKind> ExePorts, int Lat = 1,
442                          list<int> Res = [], int UOps = 1> {
443  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
444}
445
446multiclass Zn4WriteResZMM<SchedWrite SchedRW,
447                          list<ProcResourceKind> ExePorts, int Lat = 1,
448                          list<int> Res = [], int UOps = 1> {
449  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
450}
451
452// For classes with folded loads.
453multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
454                              list<ProcResourceKind> ExePorts, int Lat = 1,
455                              list<int> Res = [], int UOps = 1,
456                              int LoadUOps = 0, int LoadRes = 1> {
457  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
458                           Znver4Model.LoadLatency,
459                           LoadUOps, Zn4AGU012, LoadRes>;
460}
461
462multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
463                              list<ProcResourceKind> ExePorts, int Lat = 1,
464                              list<int> Res = [], int UOps = 1,
465                              int LoadUOps = 0, int LoadRes = 1> {
466  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
467                           Znver4Model.VecLoadLatency,
468                           LoadUOps, Zn4FPLd01, LoadRes>;
469}
470
471multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
472                              list<ProcResourceKind> ExePorts, int Lat = 1,
473                              list<int> Res = [], int UOps = 1,
474                              int LoadUOps = 0, int LoadRes = 1> {
475  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
476                           Znver4Model.VecLoadLatency,
477                           LoadUOps, Zn4FPLd01, LoadRes>;
478}
479
480multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
481                              list<ProcResourceKind> ExePorts, int Lat = 1,
482                              list<int> Res = [], int UOps = 2,
483                              int LoadUOps = 0, int LoadRes = 1> {
484  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
485                           Znver4Model.VecLoadLatency,
486                           LoadUOps, Zn4FPLd01, LoadRes>;
487}
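// For illustration: the *Pair wrappers above differ only in what they thread
// into __Zn4WriteResPair. Zn4WriteResIntPair folds loads through Zn4AGU012
// with Znver4Model.LoadLatency (4 cycles), while the XMM/YMM/ZMM variants fold
// through Zn4FPLd01 with Znver4Model.VecLoadLatency (7 cycles); the ZMM
// variant additionally defaults to 2 micro ops.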
488
489//===----------------------------------------------------------------------===//
490// Here be dragons.
491//===----------------------------------------------------------------------===//
492
493def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;
494
495def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
496def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
497def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
498
499// AMD SOG 19h, 2.11 Floating-Point Unit
500// There is 1 cycle of added latency for a result to cross
501// from F to I or I to F domain.
502def : ReadAdvance<ReadInt2Fpu, -1>;
503
504// Instructions with both a load and a store folded are modeled as a folded
505// load + WriteRMW.
506defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;
507
508// Loads, stores, and moves, not folded with other operations.
509defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;
510
511// Model the effect of clobbering the read-write mask operand of the GATHER operation.
512// It does not cost anything by itself; it only has latency, matching that of WriteLoad.
513defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
514
515def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
516  let Latency = !add(Znver4Model.LoadLatency, 1);
517  let ResourceCycles = [3, 1];
518  let NumMicroOps = 1;
519}
520def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
521
522defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
523defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
524defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;
525
526// Treat misc copies as a move.
527def : InstRW<[WriteMove], (instrs COPY)>;
528
529def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
530  let Latency = Znver4Model.LoadLatency;
531  let ResourceCycles = [1, 1, 4];
532  let NumMicroOps = 1;
533}
534def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;
535
536def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
537  let Latency = Znver4Model.StoreLatency;
538  let ResourceCycles = [4, 1, 1];
539  let NumMicroOps = 2;
540}
541def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
542
543// Arithmetic.
544defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
545
546def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
547  let Latency = 1;
548  let ResourceCycles = [4];
549  let NumMicroOps = 1;
550}
551def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
552                                        AND8i8, AND16i16, AND32i32, AND64i32,
553                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
554                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
555                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
556
557def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
558  let Latency = 1;
559  let ResourceCycles = [4];
560  let NumMicroOps = 1;
561}
562def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
563
564def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
565  let Latency = 1;
566  let ResourceCycles = [2];
567  let NumMicroOps = 1;
568}
569def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
570
571def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
572  let Latency = 3;
573  let ResourceCycles = [1];
574  let NumMicroOps = 1;
575}
576def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
577                                          PEXT32rr, PEXT64rr)>;
578
579defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.
580
581def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
582  let Latency = 1;
583  let ResourceCycles = [1, 1, 7, 1];
584  let NumMicroOps = 1;
585}
586def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
587
588// This is for simple LEAs with one or two input operands.
589defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>;     // LEA instructions can't fold loads.
590
591// This write is used for slow LEA instructions.
592def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
593  let Latency = 2;
594  let ResourceCycles = [1];
595  let NumMicroOps = 2;
596}
597
598// On Znver4, a slow LEA is either a 3-operand LEA (base, index, offset),
599// or an LEA with a `Scale` value different than 1.
600def Zn4SlowLEAPredicate : MCSchedPredicate<
601  CheckAny<[
602    // A 3-operand LEA (base, index, offset).
603    IsThreeOperandsLEAFn,
604    // An LEA with a "Scale" different than 1.
605    CheckAll<[
606      CheckIsImmOperand<2>,
607      CheckNot<CheckImmOperand<2, 1>>
608    ]>
609  ]>
610>;
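// For illustration (operands are hypothetical examples): an LEA such as
// `lea 8(%rax,%rcx,4), %rdx` (base, index and displacement, scale != 1) is
// matched by the predicate and steered via Zn4WriteLEA below to the slower
// Zn4Write3OpsLEA entry, whereas `lea (%rax,%rcx), %rdx` falls through to the
// default WriteLEA entry.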
611
612def Zn4WriteLEA : SchedWriteVariant<[
613    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
614    SchedVar<NoSchedPred,         [WriteLEA]>
615]>;
616
617def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
618
619def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
620  let Latency = 2; // FIXME: not from llvm-exegesis
621  let ResourceCycles = [4];
622  let NumMicroOps = 2;
623}
624
625def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
626
627// Integer multiplication
628defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
629defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
630defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
631defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
632defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
633defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
634defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
635defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
636defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
637defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>;    // Integer 64-bit Unsigned Multiply Without Affecting Flags.
638defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
639defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
640defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>;  // Integer multiplication, high part.
641defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>;  // Integer multiplication, high part.
642
643defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
644defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
645
646defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
647
648def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
649  let Latency = 3;
650  let ResourceCycles = [12];
651  let NumMicroOps = 3;
652}
653def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
654
655defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>;     // Compare and set, compare and swap.
656
657def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
658  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
659  let ResourceCycles = [1, 1, 12];
660  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
661}
662def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
663
664def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
665  let Latency = 3; // FIXME: not from llvm-exegesis
666  let ResourceCycles = [24];
667  let NumMicroOps = 19;
668}
669def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
670
671def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
672  let Latency = 4; // FIXME: not from llvm-exegesis
673  let ResourceCycles = [59];
674  let NumMicroOps = 28;
675}
676def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
677
678def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
679  let Latency = 1;
680  let ResourceCycles = [2];
681  let NumMicroOps = 2;
682}
683def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
684
685def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
686  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
687  let ResourceCycles = [1, 1, 2];
688  let NumMicroOps = 5;
689}
690def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
691
692def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
693  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
694  let ResourceCycles = [1, 1, 2];
695  let NumMicroOps = 2;
696}
697def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
698
699// Integer division.
700// FIXME: uop count for 8-bit division measures as 2; for the others it's a guess.
701// FIXME: latency for 8-bit division measures as 10; for the others it's a guess.
702defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
703defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
704defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
705defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
706defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
707defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
708defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
709defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
710
711defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
712defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
713
714defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
715
716def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
717  let Latency = 1;
718  let ResourceCycles = [4];
719  let NumMicroOps = 1;
720}
721def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;
722
723defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.
724
725def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
726  let Latency = 1;
727  let ResourceCycles = [4];
728  let NumMicroOps = 1;
729}
730def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
731
732defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
733
734def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
735  let Latency = 2;
736  let ResourceCycles = [4];
737  let NumMicroOps = 2;
738}
739def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
740
741defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
742defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
743defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
744defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
745defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
746
747defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
748defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
749defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;
750
751defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
752defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
753defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
754
755// Integer shifts and rotates.
756defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
757defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
758defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
759
760def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
761  let Latency = 1;
762  let ResourceCycles = [2];
763  let NumMicroOps = 1;
764}
765def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
766                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
767
768def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
769  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
770  let ResourceCycles = [1, 1, 2];
771  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
772}
773def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
774                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
775
776def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
777  let Latency = 3;
778  let ResourceCycles = [6];
779  let NumMicroOps = 7;
780}
781def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
782
783def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
784  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
785  let ResourceCycles = [1, 1, 8];
786  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
787}
788def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
789
790def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
791  let Latency = 4;
792  let ResourceCycles = [8];
793  let NumMicroOps = 9;
794}
795def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
796
797def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
798  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
799  let ResourceCycles = [1, 1, 8];
800  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
801}
802def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
803
804defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
805
806def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
807  let Latency = 3;
808  let ResourceCycles = [6];
809  let NumMicroOps = 7;
810}
811def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
812
813def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
814  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
815  let ResourceCycles = [1, 1, 8];
816  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
817}
818def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
819
820def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
821  let Latency = 4;
822  let ResourceCycles = [8];
823  let NumMicroOps = 9;
824}
825def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
826
827def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
828  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
829  let ResourceCycles = [1, 1, 8];
830  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
831}
832def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
833
834// Double shift instructions.
835defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
836defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
837defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
838defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
839
840// BMI1 BEXTR/BLS, BMI2 BZHI
841defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
842defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
843defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
844
845// Idioms that clear a register, like xorps %xmm0, %xmm0.
846// These can often bypass execution ports completely.
847defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
848
849// Branches don't produce values, so they have no latency, but they still
850// consume resources. Indirect branches can fold loads.
851defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
852
853// Floating point. This covers both scalar and vector operations.
854defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
855defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
856defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
857defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
858defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
859defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
860defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
861defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
862defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
863
864def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
865  let Latency = 2; // FIXME: not from llvm-exegesis
866  let ResourceCycles = [1, 1];
867  let NumMicroOps = 2;
868}
869def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr,  MOVHPSmr,
870                                               VMOVHPDmr, VMOVHPSmr)>;
871
872defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
873defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
874defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
875defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
876defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
877
878defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
879defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
880defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
881defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
882
883defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point add/sub.
884
885def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
886  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
887  let ResourceCycles = [1, 1, 24];
888  let NumMicroOps = 2;
889}
890def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
891                                         SUB_FI16m, SUB_FI32m,
892                                         SUBR_FI16m, SUBR_FI32m,
893                                         MUL_FI16m, MUL_FI32m)>;
894
895def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
896  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
897  let ResourceCycles = [1, 1, 62];
898  let NumMicroOps = 2;
899}
900def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
901                                       DIVR_FI16m, DIVR_FI32m)>;
902
903defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
904defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
905defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
906defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
907defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
908defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
909defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
910defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>;  // Floating point compare.
911defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (XMM).
912defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>; // Floating point compare (YMM).
913defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (ZMM).
914defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>;  // Floating point double compare.
915defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (XMM).
916defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>; // Floating point double compare (YMM).
917defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (ZMM).
918defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
919defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
920defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>;  // Floating point multiplication.
921defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
922defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
923defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
924defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
925defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
926defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
927defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
928defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>;  // Floating point division.
929defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
930defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
931defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
932defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>;  // Floating point double division.
933defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
934defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
935defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
936defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>;   // Floating point square root.
937defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
938defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
939defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>;  // Floating point square root (ZMM).
940defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>;  // Floating point double square root.
941defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
942defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
943defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
944defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
945defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>;  // Floating point reciprocal estimate.
946defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
947defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
948defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
949defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>;  // Floating point reciprocal square root estimate.
950defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
951defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
952defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
953defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>;  // Fused Multiply Add.
954defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
955defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
956defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (ZMM).
957defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
958defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
959defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
960defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
961defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
962defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
963defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).
964
965defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
966defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
967defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
968defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
969defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
970defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
971defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
972defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
973defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
974defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
975defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
976defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
977defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
978defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
979defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
980defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
981defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
982defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).
983
984// Horizontal Add/Sub (float and integer)
985defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
986defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
987defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
988defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
989defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
990defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
991defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
992
993// Vector integer operations.
994defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
995defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
996defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
997defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
998defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
999defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
1000defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
1001defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1002defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1003
1004def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
1005  let Latency = 4;
1006  let ResourceCycles = [1];
1007  let NumMicroOps = 1;
1008}
1009def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
1010
1011def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
1012  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1013  let ResourceCycles = [1, 1, 1];
1014  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1015}
1016def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1017
1018def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
1019  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1020  let ResourceCycles = [1, 1, 1];
1021  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1022}
1023def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
1024
1025defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1026defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1027defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
1028defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1029defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
1030defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
1031defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1032
1033defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
1034defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
1035
1036def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1037  let Latency = 1;
1038  let ResourceCycles = [1, 2];
1039  let NumMicroOps = 2;
1040}
1041def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1042
1043def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
1044  let Latency = 1;
1045  let ResourceCycles = [1, 4];
1046  let NumMicroOps = 2;
1047}
1048def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1049
1050defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.
1051
1052def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1053  let Latency = 3;
1054  let ResourceCycles = [1, 1];
1055  let NumMicroOps = 1;
1056}
1057def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1058
1059def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
1060  let Latency = 3;
1061  let ResourceCycles = [1, 1];
1062  let NumMicroOps = 2;
1063}
1064def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1065
1066defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1067
1068def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1069  let Latency = 2;
1070  let ResourceCycles = [2];
1071  let NumMicroOps = 1;
1072}
1073def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1074                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1075                                            PAVGBrr, PAVGWrr,
1076                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
1077                                            VPABSBrr, VPABSDrr, VPABSWrr,
1078                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1079                                            VPAVGBrr, VPAVGWrr,
1080                                            VPCMPEQQrr,
1081                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1082                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1083
1084def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
1085  let Latency = 1;
1086  let ResourceCycles = [1];
1087  let NumMicroOps = 1;
1088}
1089def : InstRW<[Zn4WriteVecOpMask], (instrs   KADDBrr, KADDDrr, KADDQrr, KADDWrr,
1090                                            KANDBrr, KANDDrr, KANDQrr, KANDWrr,
1091                                            KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
1092                                            KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
1093                                            KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
1094                                            KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
1095                                            KORBrr, KORDrr, KORQrr, KORWrr,
1096                                            KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
1097                                            KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
1098                                            KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
1099                                            KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
1100                                            KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;
1101
1102def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
1103  let Latency = 1;
1104  let ResourceCycles = [1];
1105  let NumMicroOps = 1;
1106}
1107def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
1108
1109def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
1110  let Latency = 1;
1111  let ResourceCycles = [1];
1112  let NumMicroOps = 1;
1113}
1114def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
1115
1116def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1117  // TODO: All align instructions are expected to have 4-cycle latency
1118  let Latency = 4;
1119  let ResourceCycles = [1];
1120  let NumMicroOps = 1;
1121}
1122def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1123                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1124                                            >;
1125defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1126
1127def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
1128  let Latency = 1;
1129  let ResourceCycles = [1];
1130  let NumMicroOps = 1;
1131}
1132def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1133                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1134                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1135                                            VPAVGBYrr, VPAVGWYrr,
1136                                            VPCMPEQQYrr,
1137                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1138
1139defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
1140
1141defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
1142defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1143defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1144defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
1145defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1146defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
1147defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (ZMM).
1148defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
1149defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
1150defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1151defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
1152defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
1153defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1154defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1155defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
1156defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
1157defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1158defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1159defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
1160defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
1161defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1162defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
1163defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector shuffles.
1164defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1165defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1166defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
1167defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
1168defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1169defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1170defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
1171defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
1172defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1173defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
1174defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
1175defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1176defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
1177defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
1178defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1179defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1180defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
1181defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1182defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1183defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
1184defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1185
1186// Vector insert/extract operations.
1187defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1188defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1189defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
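// Store-folding writes stack the pipe latency on top of the model's store
// latency via !add; WriteVecExtractSt above therefore resolves to
// !add(1, Znver4Model.StoreLatency) = 2 cycles across Zn4FPSt and Zn4Store.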
1190
1191// MOVMSK operations.
1192defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1193defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1194defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
1195defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
1196
1197// Conversion between integer and float.
1198defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>;  // Double -> Integer.
1199defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
1200defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
1201defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).
1202
1203def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1204  let Latency = 1;
1205  let ResourceCycles = [2];
1206  let NumMicroOps = 2;
1207}
1208defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>;  // Float -> Integer.
1209
1210defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1211defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
1212defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
1213
1214defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
1215defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1216defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1217defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).
1218
1219def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1220  let Latency = 2;
1221  let ResourceCycles = [6];
1222  let NumMicroOps = 2;
1223}
1224
1225defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
1226defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1227defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1228defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).
1229
1230def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
1231  let Latency = 3;
1232  let ResourceCycles = [1];
1233  let NumMicroOps = 2;
1234}
1235
1236defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion.
1237defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1238defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1239defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).
1240
1241defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion.
1242defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1243defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1244defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).
1245
1246defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1247defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1248defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).
1249
1250defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1251defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1252defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).
1253
1254defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1255defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1256defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).
1257
1258// CRC32 instruction.
1259defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;
1260
1261def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
1262  let Latency = 2;
1263  let ResourceCycles = [2];
1264  let NumMicroOps = 2;
1265}
1266def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1267
1268def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1269  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
1270  let ResourceCycles = [1, 1, 2];
1271  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
1272}
1273def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
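// The memory form above reuses the register write's numbers: its latency is
// Znver4Model.LoadLatency plus Zn4WriteSHA1MSG1rr.Latency (4 + 2 = 6 cycles)
// and the micro-op count is carried over unchanged via !add(..., 0). The same
// pattern is used for the other SHA *rm writes below.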
1274
1275def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
1276  let Latency = 1;
1277  let ResourceCycles = [2];
1278  let NumMicroOps = 1;
1279}
1280def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1281
1282def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1283  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1284  let ResourceCycles = [1, 1, 2];
1285  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1286}
1287def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1288
1289def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
1290  let Latency = 2;
1291  let ResourceCycles = [3];
1292  let NumMicroOps = 2;
1293}
1294def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
1295
1296def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1297  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
1298  let ResourceCycles = [1, 1, 3];
1299  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
1300}
1301def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1302
1303def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
1304  let Latency = 3;
1305  let ResourceCycles = [8];
1306  let NumMicroOps = 4;
1307}
1308def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
1309
1310def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
1311  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
1312  let ResourceCycles = [1, 1, 8];
1313  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
1314}
1315def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1316
1317def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
1318  let Latency = 6;
1319  let ResourceCycles = [8];
1320  let NumMicroOps = 1;
1321}
1322def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1323
1324def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
1325  let Latency = 4;
1326  let ResourceCycles = [8];
1327  let NumMicroOps = 1;
1328}
1329def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1330
1331// String instructions.
1332// Packed Compare Implicit Length Strings, Return Mask
1333defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1334// Packed Compare Explicit Length Strings, Return Mask
1335defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1336// Packed Compare Implicit Length Strings, Return Index
1337defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
1338// Packed Compare Explicit Length Strings, Return Index
1339defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1340
1341// AES instructions.
1342defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
1343defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
1344defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
1345
1346// Carry-less multiplication instructions.
1347defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
1348
1349// EMMS/FEMMS
1350defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1351
1352// Load/store MXCSR
1353defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1354defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1355
1356// Catch-all for expensive system instructions.
1357defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
1358
1359def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
1360  let Latency = 0; // FIXME: not from llvm-exegesis
1361  let ResourceCycles = [1];
1362  let NumMicroOps = 1;
1363}
1364def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;
1365
1366def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
1367  let Latency = 10; // FIXME: not from llvm-exegesis
1368  let ResourceCycles = [24];
1369  let NumMicroOps = 18;
1370}
1371def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
1372
1373// AVX2.
1374defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1375defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1376defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
1377
1378def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
1379  let Latency = 3;
1380  let ResourceCycles = [1];
1381  let NumMicroOps = 1;
1382}
1383def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
1384
1385def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1386  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
1387  let ResourceCycles = [1, 1, 1];
1388  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1389}
1390def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
1391
1392def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
1393  let Latency = 7;
1394  let ResourceCycles = [1];
1395  let NumMicroOps = 2;
1396}
1397def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
1398
1399def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1400  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
1401  let ResourceCycles = [1, 1, 2];
1402  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
1403}
1404def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1405
1406def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
1407  let Latency = 6;
1408  let ResourceCycles = [1];
1409  let NumMicroOps = 2;
1410}
1411def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
1412
1413def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1414  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
1415  let ResourceCycles = [1, 1, 2];
1416  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
1417}
1418def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1419
1420def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
1421  let Latency = 5;
1422  let ResourceCycles = [1];
1423  let NumMicroOps = 2;
1424}
1425def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
1426
1427def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
1428  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
1429  let ResourceCycles = [1, 1, 2];
1430  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
1431}
1432def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
1433
1434defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1435defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
1436defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
1437defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1438defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
1439
1440// Old microcoded instructions that nobody uses.
1441defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;
1442
1443// Fence instructions.
1444defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;
1445
1446def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
1447  let Latency = 1;
1448  let ResourceCycles = [30];
1449  let NumMicroOps = 1;
1450}
1451def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;
1452
1453def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
1454  let Latency = 1;
1455  let ResourceCycles = [1];
1456  let NumMicroOps = 1;
1457}
1458def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;
1459
1460// Nop, not very useful except that it provides a model for nops!
1461defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1462
1463
1464///////////////////////////////////////////////////////////////////////////////
1465// Zero Cycle Move
1466///////////////////////////////////////////////////////////////////////////////
1467
1468def Zn4WriteZeroLatency : SchedWriteRes<[]> {
1469  let Latency = 0;
1470  let ResourceCycles = [];
1471  let NumMicroOps = 1;
1472}
1473def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1474                                               MOV64rr, MOV64rr_REV,
1475                                               MOVSX32rr32)>;
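// These GPR reg-reg moves are expected to be eliminated at register rename,
// hence the empty resource list and zero latency above; the full set of
// eliminable moves is declared for analysis tools in the
// IsOptimizableRegisterMove block below.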
1476
1477def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
1478  let Latency = 0;
1479  let ResourceCycles = [];
1480  let NumMicroOps = 2;
1481}
1482def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1483                                               XCHG64rr, XCHG64ar)>;
1484
1485defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>;        // Compare+Exchange - TODO RMW support.
1486
1487defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1488defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1489defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;
1490
1491defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
1492defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1493defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
1494defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;
1495
1496def : IsOptimizableRegisterMove<[
1497  InstructionEquivalenceClass<[
1498    // GPR variants.
1499    MOV32rr, MOV32rr_REV,
1500    MOV64rr, MOV64rr_REV,
1501    MOVSX32rr32,
1502    XCHG32rr, XCHG32ar,
1503    XCHG64rr, XCHG64ar,
1504
1505    // MMX variants.
1506    // MMX moves are *NOT* eliminated.
1507
1508    // SSE variants.
1509    MOVAPSrr, MOVAPSrr_REV,
1510    MOVUPSrr, MOVUPSrr_REV,
1511    MOVAPDrr, MOVAPDrr_REV,
1512    MOVUPDrr, MOVUPDrr_REV,
1513    MOVDQArr, MOVDQArr_REV,
1514    MOVDQUrr, MOVDQUrr_REV,
1515
1516    // AVX variants.
1517    VMOVAPSrr, VMOVAPSrr_REV,
1518    VMOVUPSrr, VMOVUPSrr_REV,
1519    VMOVAPDrr, VMOVAPDrr_REV,
1520    VMOVUPDrr, VMOVUPDrr_REV,
1521    VMOVDQArr, VMOVDQArr_REV,
1522    VMOVDQUrr, VMOVDQUrr_REV,
1523
1524    // AVX YMM variants.
1525    VMOVAPSYrr, VMOVAPSYrr_REV,
1526    VMOVUPSYrr, VMOVUPSYrr_REV,
1527    VMOVAPDYrr, VMOVAPDYrr_REV,
1528    VMOVUPDYrr, VMOVUPDYrr_REV,
1529    VMOVDQAYrr, VMOVDQAYrr_REV,
1530    VMOVDQUYrr, VMOVDQUYrr_REV,
1531  ], TruePred >
1532]>;
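// The equivalence classes above are consumed through the subtarget's
// register-move-elimination query (isOptimizableRegisterMove), which tools
// such as llvm-mca use when simulating move elimination; TruePred marks every
// listed move as unconditionally eligible. This declaration does not change
// the scheduling classes assigned elsewhere in this file.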
1533
1534// FIXUP and RANGE Instructions
1535def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
1536  let Latency = 2;
1537  let ResourceCycles = [2];
1538  let NumMicroOps = 1;
1539}
1540def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
1541	"VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
1542        "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri",  "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
1543	"VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
1544	)>;
1545
1546// SCALE & REDUCE instructions
1547def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
1548  let Latency = 6;
1549  let ResourceCycles = [6];
1550  let NumMicroOps = 2;
1551}
1552def : InstRW<[Zn4WriteSCALErr], (instregex
1553        "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
1554        "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
1555	)>;
1556
1557// BF16 (DPBF16PS) Instructions
1558def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
1559  let Latency = 6;
1560  let ResourceCycles = [6];
1561  let NumMicroOps = 2;
1562}
1563def : InstRW<[Zn4WriteBF16], (instregex
1564        "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
1565	)>;
1566
1567// Dot-product (VPDPBUSD/VPDPWSSD) and VPMADD52 Instructions
1568def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
1569  let Latency = 4;
1570  let ResourceCycles = [4];
1571  let NumMicroOps = 1;
1572}
1573def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
1574	"VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
1575        "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
1576	)>;
1577
1578// SHIFT instructions
1579def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
1580  let Latency = 2;
1581  let ResourceCycles = [2];
1582  let NumMicroOps = 1;
1583}
1584def : InstRW<[Zn4WriteSHIFTrr], (instregex
1585        "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
1586        "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1587        "(V?)P(SLL|SRL|SRA)DQYri",
1588        "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
1589        "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
1590        "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1591        "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
1592        "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
1593	"VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
1594	)>;
1595
1596def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
1597  let Latency = 1;
1598  let ResourceCycles = [1];
1599  let NumMicroOps = 1;
1600}
1601def : InstRW<[Zn4WriteSHIFTri], (instregex
1602        "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
1603	)>;
1604
1605// ALIGN Instructions
1606def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
1607  let Latency = 2;
1608  let ResourceCycles = [2];
1609  let NumMicroOps = 1;
1610}
1611def : InstRW<[Zn4WriteALIGN], (instregex
1612        "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
1613	)>;
1614
1615// PACK Instructions
1616def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
1617  let Latency = 2;
1618  let ResourceCycles = [2];
1619  let NumMicroOps = 1;
1620}
1621def : InstRW<[Zn4WritePACK], (instregex
1622        "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
1623	)>;
1624
1625// MAX and MIN Instructions
1626def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
1627  let Latency = 2;
1628  let ResourceCycles = [2];
1629  let NumMicroOps = 1;
1630}
1631def : InstRW<[Zn4WriteFCmp64], (instregex
1632        "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
1633        "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
1634        "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
1635        "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
1636	)>;
1637
1638// MOV Instructions
1639def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
1640  let Latency = 2;
1641  let ResourceCycles = [2];
1642  let NumMicroOps = 1;
1643}
1644def : InstRW<[Zn4MOVS], (instregex
1645        "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
1646        "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
1647        "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
1648        "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
1649        "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
1650	)>;
1651
1652def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
1653  let Latency = 4;
1654  let ResourceCycles = [4];
1655  let NumMicroOps = 1;
1656}
1657def : InstRW<[Zn4MOVSZ], (instregex
1658        "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
1659	)>;
1660
1661def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
1662  let Latency = 5;
1663  let ResourceCycles = [5];
1664  let NumMicroOps = 1;
1665}
1666def : InstRW<[Zn4MOVSrr], (instregex
1667        "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
1668	)>;
1669
1670
1671// VPTEST Instructions
1672def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1673  let Latency = 3;
1674  let ResourceCycles = [3];
1675  let NumMicroOps = 1;
1676}
1677def : InstRW<[Zn4VPTESTZ128], (instregex
1678        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
1679	)>;
1680
1681def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1682  let Latency = 4;
1683  let ResourceCycles = [4];
1684  let NumMicroOps = 1;
1685}
1686def : InstRW<[Zn4VPTESTZ256], (instregex
1687        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
1688	)>;
1689
1690def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
1691  let Latency = 5;
1692  let ResourceCycles = [5];
1693  let NumMicroOps = 1;
1694}
1695def : InstRW<[Zn4VPTESTZ], (instregex
1696        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
1697	)>;
1698
1699// CONFLICT Instructions
1700def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1701  let Latency = 2;
1702  let ResourceCycles = [2];
1703  let NumMicroOps = 1;
1704}
1705def : InstRW<[Zn4CONFLICTZ128], (instregex
1706        "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
1707	)>;
1708
1709def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
1710  let Latency = 6;
1711  let ResourceCycles = [2,2,2];
1712  let NumMicroOps = 4;
1713}
1714def : InstRW<[Zn4CONFLICTrr], (instregex
1715        "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
1716	)>;
1717
1718// RSQRT Instructions
1719def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1720  let Latency = 5;
1721  let ResourceCycles = [2];
1722  let NumMicroOps = 1;
1723}
1724def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
1725        "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
1726	)>;
1727
1728
1729// PERM Instructions
1730def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
1731  let Latency = 2;
1732  let ResourceCycles = [2];
1733  let NumMicroOps = 1;
1734}
1735def : InstRW<[Zn4PERMILP], (instregex
1736        "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
1737	)>;
1738
1739def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
1740  let Latency = 3;
1741  let ResourceCycles = [2];
1742  let NumMicroOps = 1;
1743}
1744def : InstRW<[Zn4PERMIT2_128], (instregex
1745        "VPERM(I2|T2)(PS|PD|W)128(rr|rrk|rrkz)",
1746        "VPERM(I2|T2)(B|D|Q)128(rr|rrk|rrkz)"
1747	)>;
1748
1749def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
1750  let Latency = 2;
1751  let ResourceCycles = [2];
1752  let NumMicroOps = 1;
1753}
1754def : InstRW<[Zn4PERMIT2_128rr], (instregex
1755	"V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
1756	"VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
1757	)>;
1758
1759def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
1760  let Latency = 4;
1761  let ResourceCycles = [2];
1762  let NumMicroOps = 1;
1763}
1764def : InstRW<[Zn4PERMIT2_256], (instregex
1765        "VPERM(I2|T2)(PS|PD|W)256(rr|rrk|rrkz)",
1766	"VPERMP(S|D)Z256(rr|rrk|rrkz)",
1767	"V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
1768	"VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
1769	"VPERM(I2|Q|T2)(B|D|Q)(Z?)256(rr|rrk|rrkz)",
1770	"VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
1771	)>;
1772
1773def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
1774  let Latency = 5;
1775  let ResourceCycles = [2];
1776  let NumMicroOps = 1;
1777}
1778def : InstRW<[Zn4PERMIT2Z], (instregex
1779        "VPERM(I2|T2)(PS|PD|W)(rr|rrk|rrkz)",
1780	"VPERM(B|D|W)Z(rr|rrk|rrkz)",
1781	"VPERM(I2|Q|T2)(B|D|Q)(Z?)(rr|rrk|rrkz)",
1782	"V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
1783	"VPEXPAND(B|W)Z(rr|rrk|rrkz)",
1784        "VPERMP(S|D)Z(rr|rrk|rrkz)"
1785	)>;
1786
1787// Slow vector ALU misc instructions (AVX-512 128-bit forms)
1788def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
1789  let Latency = 2;
1790  let ResourceCycles = [2];
1791  let NumMicroOps = 1;
1792}
1793def : InstRW<[Zn4VecALUZSlow], (instrs
1794	VPABSBZ128rr,      VPABSBZ128rrk,  VPABSBZ128rrkz,   VPABSDZ128rr,
1795	VPABSDZ128rrk,     VPABSDZ128rrkz, VPABSQZ128rr,     VPABSQZ128rrk,
1796	VPABSQZ128rrkz,    VPABSWZ128rr,   VPABSWZ128rrk,    VPABSWZ128rrkz,
1797	VPADDSBZ128rr,     VPADDSBZ128rrk, VPADDSBZ128rrkz,  VPADDSWZ128rr,
1798	VPADDSWZ128rrk,    VPADDSWZ128rrkz,VPADDUSBZ128rr,   VPADDUSBZ128rrk,
1799	VPADDUSBZ128rrkz,  VPADDUSWZ128rr, VPADDUSWZ128rrk,  VPADDUSWZ128rrkz,
1800	VPAVGBZ128rr,      VPAVGBZ128rrk,  VPAVGBZ128rrkz,   VPAVGWZ128rr,
1801	VPAVGWZ128rrk,     VPAVGWZ128rrkz, VPOPCNTBZ128rr,   VPOPCNTBZ128rrk,
1802	VPOPCNTBZ128rrkz,  VPOPCNTDZ128rr, VPOPCNTDZ128rrk,  VPOPCNTDZ128rrkz,
1803	VPOPCNTQZ128rr,    VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
1804	VPOPCNTWZ128rrk,   VPOPCNTWZ128rrkz,VPSUBSBZ128rr,   VPSUBSBZ128rrk,
1805	VPSUBSBZ128rrkz,   VPSUBSWZ128rr,   VPSUBSWZ128rrk,  VPSUBSWZ128rrkz,
1806	VPSUBUSBZ128rr,    VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
1807	VPSUBUSWZ128rrk,   VPSUBUSWZ128rrkz
1808	)>;
1809
1810
1811///////////////////////////////////////////////////////////////////////////////
1812// Dependency breaking instructions.
1813///////////////////////////////////////////////////////////////////////////////
1814
1815def Zn4WriteZeroIdiom : SchedWriteVariant<[
1816    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1817    SchedVar<NoSchedPred,                          [WriteALU]>
1818]>;
1819def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1820                                          XOR64rr, XOR64rr_REV,
1821                                          SUB32rr, SUB32rr_REV,
1822                                          SUB64rr, SUB64rr_REV)>;
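// Zero idioms: e.g. `xor eax, eax` or `sub rax, rax` always produce zero
// regardless of the source value, so when the zero-idiom predicate matches
// (both source operands are the same register) the variant above uses
// Zn4WriteZeroLatency instead of the generic WriteALU timing.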
1823
1824def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1825    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
1826    SchedVar<NoSchedPred,                                 [WriteALU]>
1827]>;
1828def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr,  CMP8rr_REV,
1829                                                CMP16rr, CMP16rr_REV,
1830                                                CMP32rr, CMP32rr_REV,
1831                                                CMP64rr, CMP64rr_REV)>;
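// A register compared against itself (e.g. `cmp eax, eax`) sets EFLAGS to the
// same value no matter what the register holds, so CheckSameRegOperand<0, 1>
// lets these CMPs take the zero-latency path; they are also listed as
// dependency breaking in IsDepBreakingFunction below.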
1832
1833def Zn4WriteFZeroIdiom : SchedWriteVariant<[
1834    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1835    SchedVar<NoSchedPred,                          [WriteFLogic]>
1836]>;
1837// NOTE: XORPSrr, XORPDrr are not zero-cycle!
1838def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1839                                           VANDNPSrr, VANDNPDrr)>;
1840
1841def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
1842    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1843    SchedVar<NoSchedPred,                          [WriteFLogicY]>
1844]>;
1845def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1846                                            VANDNPSYrr, VANDNPDYrr)>;
1847
1848def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
1849    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1850    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
1851]>;
1852// NOTE: PXORrr, PANDNrr are not zero-cycle!
1853def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
1854
1855def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
1856    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1857    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
1858]>;
1859def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
1860
1861def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
1862    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1863    SchedVar<NoSchedPred,                          [WriteVecALUX]>
1864]>;
1865// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1866//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1867def : InstRW<[Zn4WriteVZeroIdiomALUX],
1868             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1869                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1870
1871def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
1872    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
1873    SchedVar<NoSchedPred,                          [WriteVecALUY]>
1874]>;
1875def : InstRW<[Zn4WriteVZeroIdiomALUY],
1876             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1877                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1878
1879def : IsZeroIdiomFunction<[
1880  // GPR Zero-idioms.
1881  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1882                     XOR64rr, XOR64rr_REV,
1883                     SUB32rr, SUB32rr_REV,
1884                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1885
1886  // SSE XMM Zero-idioms.
1887  DepBreakingClass<[
1888    // fp variants.
1889    XORPSrr, XORPDrr,
1890    ANDNPSrr, ANDNPDrr,
1891
1892    // int variants.
1893    PXORrr,
1894    PANDNrr,
1895    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1896    PSUBSBrr, PSUBSWrr,
1897    PSUBUSBrr, PSUBUSWrr,
1898    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1899  ], ZeroIdiomPredicate>,
1900
1901  // AVX XMM Zero-idioms.
1902  DepBreakingClass<[
1903    // fp variants.
1904    VXORPSrr, VXORPDrr,
1905    VANDNPSrr, VANDNPDrr,
1906
1907    // int variants.
1908    VPXORrr,
1909    VPANDNrr,
1910    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1911    VPSUBSBrr, VPSUBSWrr,
1912    VPSUBUSBrr, VPSUBUSWrr,
1913    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1914  ], ZeroIdiomPredicate>,
1915
1916  // AVX YMM Zero-idioms.
1917  DepBreakingClass<[
1918    // fp variants.
1919    VXORPSYrr, VXORPDYrr,
1920    VANDNPSYrr, VANDNPDYrr,
1921
1922    // int variants.
1923    VPXORYrr,
1924    VPANDNYrr,
1925    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1926    VPSUBSBYrr, VPSUBSWYrr,
1927    VPSUBUSBYrr, VPSUBUSWYrr,
1928    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1929  ], ZeroIdiomPredicate>,
1930]>;
1931
1932def : IsDepBreakingFunction<[
1933  // GPR
1934  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
1935                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
1936  DepBreakingClass<[ CMP8rr,  CMP8rr_REV,
1937                     CMP16rr, CMP16rr_REV,
1938                     CMP32rr, CMP32rr_REV,
1939                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
1940  // SSE
1941  DepBreakingClass<[
1942    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1943  ], ZeroIdiomPredicate>,
1944
1945  // AVX XMM
1946  DepBreakingClass<[
1947    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1948  ], ZeroIdiomPredicate>,
1949
1950  // AVX YMM
1951  DepBreakingClass<[
1952    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
1953  ], ZeroIdiomPredicate>,
1954]>;
1955
1956} // SchedModel
1957
1958