xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver3 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15//    http://www.agner.org/optimize/microarchitecture.pdf
16//  * AMD Zen 3 Ryzen Deep Dive Review
17//    https://www.anandtech.com/show/16214/
18//===----------------------------------------------------------------------===//
19
// Machine model record for Znver3. Fields assigned with `let` override values
// inherited from SchedMachineModel; the two fields introduced with `int`
// (VecLoadLatency, StoreLatency) are declared on this record itself and are
// read later in this file as Znver3Model.VecLoadLatency and
// Znver3Model.StoreLatency.
20def Znver3Model : SchedMachineModel {
21  // AMD SOG 19h, 2.9.6 Dispatch
22  // The processor may dispatch up to 6 macro ops per cycle
23  // into the execution engine.
24  let IssueWidth = 6;
25  // AMD SOG 19h, 2.10.3
26  // The retire control unit (RCU) tracks the completion status of all
27  // outstanding operations (integer, load/store, and floating-point) and is
28  // the final arbiter for exception processing and recovery.
29  // The unit can receive up to 6 macro ops dispatched per cycle and track up
30  // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
31  let MicroOpBufferSize = 256;
32  // AMD SOG 19h, 2.9.1 Op Cache
33  // The op cache is organized as an associative cache with 64 sets and 8 ways.
34  // At each set-way intersection is an entry containing up to 8 macro ops.
35  // The maximum capacity of the op cache is 4K ops.
36  // Agner, 22.5 µop cache
37  // The size of the µop cache is big enough for holding most critical loops.
38  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
39  //        with large values here the compilation of certain loops
40  //        ends up taking way too long.
41  // let LoopMicroOpBufferSize = 4096;
42  let LoopMicroOpBufferSize = 512;
43  // AMD SOG 19h, 2.6.2 L1 Data Cache
44  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
45  // AMD SOG 19h, 2.12 L1 Data Cache
46  // The AGU and LS pipelines are optimized for simple address generation modes.
47  // <...> and can achieve 4-cycle load-to-use integer load latency.
48  let LoadLatency = 4;
49  // AMD SOG 19h, 2.12 L1 Data Cache
50  // The AGU and LS pipelines are optimized for simple address generation modes.
51  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
52  int VecLoadLatency = 7;
53  // Latency of a simple store operation.
54  int StoreLatency = 1;
55  // FIXME
56  let HighLatency = 25; // FIXME: any better choice?
57  // AMD SOG 19h, 2.8 Optimizing Branching
58  // The branch misprediction penalty is in the range from 11 to 18 cycles,
59  // <...>. The common case penalty is 13 cycles.
60  let MispredictPenalty = 13;
61
62  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
63
64  let CompleteModel = 1;
65}
66
67let SchedModel = Znver3Model in {
68
69
70//===----------------------------------------------------------------------===//
71// RCU
72//===----------------------------------------------------------------------===//
73
74// AMD SOG 19h, 2.10.3 Retire Control Unit
75// The unit can receive up to 6 macro ops dispatched per cycle and track up to
76// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
77// The retire unit handles in-order commit of up to eight macro ops per cycle.
// The two arguments line up with the figures quoted above: 256 tracked macro
// ops (taken from Znver3Model.MicroOpBufferSize rather than repeated as a
// literal) and 8 macro ops retired per cycle.
78def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;
79
80//===----------------------------------------------------------------------===//
81// Units
82//===----------------------------------------------------------------------===//
83
84// There are a total of three Units, each one with its own schedulers.
85
86//===----------------------------------------------------------------------===//
87// Integer Execution Unit
88//
89
90// AMD SOG 19h, 2.4 Superscalar Organization
91// The processor uses four decoupled independent integer scheduler queues,
92// each one servicing one ALU pipeline and one or two other pipelines
93
94//
95// Execution pipes
96//===----------------------------------------------------------------------===//
97
98// AMD SOG 19h, 2.10.2 Execution Units
99// The processor contains 4 general purpose integer execution pipes.
100// Each pipe has an ALU capable of general purpose integer operations.
101def Zn3ALU0 : ProcResource<1>;
102def Zn3ALU1 : ProcResource<1>;
103def Zn3ALU2 : ProcResource<1>;
104def Zn3ALU3 : ProcResource<1>;
105
106// AMD SOG 19h, 2.10.2 Execution Units
107// There is also a separate branch execution unit.
108def Zn3BRU1 : ProcResource<1>;
109
110// AMD SOG 19h, 2.10.2 Execution Units
111// There are three Address Generation Units (AGUs) for all load and store
112// address generation. There are also 3 store data movement units
113// associated with the same schedulers as the AGUs.
114def Zn3AGU0 : ProcResource<1>;
115def Zn3AGU1 : ProcResource<1>;
116def Zn3AGU2 : ProcResource<1>;
117
118//
119// Execution Units
120//===----------------------------------------------------------------------===//
121
122// AMD SOG 19h, 2.10.2 Execution Units
123// ALU0 additionally has divide <...> execution capability.
// NOTE: each `defvar` below only gives a second name to an ALU pipe that is
// already defined above; no additional ProcResource is created. Zn3Divider and
// Zn3BRU0 both name Zn3ALU0, and Zn3Multiplier names Zn3ALU1.
124defvar Zn3Divider = Zn3ALU0;
125
126// AMD SOG 19h, 2.10.2 Execution Units
127// ALU0 additionally has <...> branch execution capability.
128defvar Zn3BRU0 = Zn3ALU0;
129
130// Integer Multiplication issued on ALU1.
131defvar Zn3Multiplier = Zn3ALU1;
132
133// Execution pipeline grouping
134//===----------------------------------------------------------------------===//
135
136// General ALU operations
// Each group below names a set of integer pipes; the digits in the group name
// list the member pipe indices.
137def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;
138
139// General AGU operations
140def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;
141
142// Control flow: jumps, calls
// Zn3BRU0 is a defvar alias of Zn3ALU0, so the members of this group are
// Zn3ALU0 and the dedicated branch unit Zn3BRU1.
143def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;
144
145// Everything that isn't control flow, but still needs to access CC register,
146// namely: conditional moves, SETcc.
147def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;
148
149// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
150
151// Simple bit twiddling: bit test, shift/rotate, bit extraction
152def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;
153
154
155//
156// Scheduling
157//===----------------------------------------------------------------------===//
158
159// AMD SOG 19h, 2.10.3 Retire Control Unit
160// The integer physical register file (PRF) consists of 192 registers.
// Positional arguments: 192 physical registers, the register classes that
// draw from the file (GR64 and CCR), then two lists that line up with the
// class list element by element.
// NOTE(review): the two parallel lists are presumably the per-class rename
// cost and the per-class move-elimination flag (so GR64 moves may be
// eliminated, CCR moves may not) -- confirm against the RegisterFile class
// declaration.
161def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
162                              6,  // Max moves that can be eliminated per cycle.
163                              0>; // Restrict move elimination to zero regs.
164
165// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
166// AMD SOG 19h, 2.10.1 Schedulers
167// The schedulers can receive up to six macro ops per cycle, with a limit of
168// two per scheduler. Each scheduler can issue one micro op per cycle into
169// each of its associated pipelines
170// FIXME: these are 4 separate schedulers, not a single big one.
// One group covering every integer-side pipe, annotated by the scheduler each
// pipe belongs to. Zn3BRU0 is a defvar alias of Zn3ALU0, so that pipe appears
// in the scheduler-0 row under both names.
171def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
172                           Zn3ALU1, Zn3AGU1,          // scheduler 1
173                           Zn3ALU2, Zn3AGU2,          // scheduler 2
174                           Zn3ALU3,          Zn3BRU1  // scheduler 3
175                          ]> {
  // 4 schedulers * 24 entries = 96, modeled as one shared buffer (see the
  // FIXME above).
176  let BufferSize = !mul(4, 24);
177}
178
179
180//===----------------------------------------------------------------------===//
181// Floating-Point Unit
182//
183
184// AMD SOG 19h, 2.4 Superscalar Organization
185// The processor uses <...> two decoupled independent floating point schedulers
186// each servicing two FP pipelines and one store or FP-to-integer pipeline.
187
188//
189// Execution pipes
190//===----------------------------------------------------------------------===//
191
192// AMD SOG 19h, 2.10.1 Schedulers
193// <...>, and six FPU pipes.
194// Agner, 22.10 Floating point execution pipes
195// There are six floating point/vector execution pipes,
// Pipes 0-3 are separate single-unit resources. Zn3FPP45 is one resource with
// two units rather than two records; it presumably stands for the remaining
// two of the six pipes mentioned above.
196def Zn3FPP0  : ProcResource<1>;
197def Zn3FPP1  : ProcResource<1>;
198def Zn3FPP2  : ProcResource<1>;
199def Zn3FPP3  : ProcResource<1>;
200def Zn3FPP45 : ProcResource<2>;
201
202//
203// Execution Units
204//===----------------------------------------------------------------------===//
205// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
206
207// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
208defvar Zn3FPFMul0 = Zn3FPP0;
209defvar Zn3FPFMul1 = Zn3FPP1;
210
211// (v)FADD*
212defvar Zn3FPFAdd0 = Zn3FPP2;
213defvar Zn3FPFAdd1 = Zn3FPP3;
214
215// All convert operations except pack/unpack
216defvar Zn3FPFCvt0 = Zn3FPP2;
217defvar Zn3FPFCvt1 = Zn3FPP3;
218
219// All Divide and Square Root except Reciprocal Approximation
220// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
221// FDIV unit can support 2 simultaneous operations in flight
222// even though it occupies a single pipe.
223// FIXME: BufferSize=2 ?
224defvar Zn3FPFDiv = Zn3FPP1;
225
226// Moves and Logical operations on Floating Point Data Types
227defvar Zn3FPFMisc0 = Zn3FPP0;
228defvar Zn3FPFMisc1 = Zn3FPP1;
229defvar Zn3FPFMisc2 = Zn3FPP2;
230defvar Zn3FPFMisc3 = Zn3FPP3;
231
232// Integer Adds, Subtracts, and Compares
233// Some complex VADD operations are not available in all pipes.
234defvar Zn3FPVAdd0 = Zn3FPP0;
235defvar Zn3FPVAdd1 = Zn3FPP1;
236defvar Zn3FPVAdd2 = Zn3FPP2;
237defvar Zn3FPVAdd3 = Zn3FPP3;
238
239// Integer Multiplies, SAD, Blendvb
240defvar Zn3FPVMul0 = Zn3FPP0;
241defvar Zn3FPVMul1 = Zn3FPP3;
242
243// Data Shuffles, Packs, Unpacks, Permute
244// Some complex shuffle operations are only available in pipe1.
245defvar Zn3FPVShuf = Zn3FPP1;
246defvar Zn3FPVShufAux = Zn3FPP2;
247
248// Bit Shift Left/Right operations
249defvar Zn3FPVShift0 = Zn3FPP1;
250defvar Zn3FPVShift1 = Zn3FPP2;
251
252// Moves and Logical operations on Packed Integer Data Types
253defvar Zn3FPVMisc0 = Zn3FPP0;
254defvar Zn3FPVMisc1 = Zn3FPP1;
255defvar Zn3FPVMisc2 = Zn3FPP2;
256defvar Zn3FPVMisc3 = Zn3FPP3;
257
258// *AES*
259defvar Zn3FPAES0 = Zn3FPP0;
260defvar Zn3FPAES1 = Zn3FPP1;
261
262// *CLM*
263defvar Zn3FPCLM0 = Zn3FPP0;
264defvar Zn3FPCLM1 = Zn3FPP1;
265
266// Execution pipeline grouping
267//===----------------------------------------------------------------------===//
268
269// AMD SOG 19h, 2.11 Floating-Point Unit
270// Stores and floating point to general purpose register transfer
271// have 2 dedicated pipelines (pipe 5 and 6).
272def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
273
274// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
275def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
276
277// (v)FADD*
278// Some complex VADD operations are not available in all pipes.
279def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;
280
281// All convert operations except pack/unpack
282def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;
283
284// All Divide and Square Root except Reciprocal Approximation
285// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;
286
287// Moves and Logical operations on Floating Point Data Types
288def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3]>;
289
290def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
291
292// Loads, Stores and Move to General Register (EX) Operations
293// AMD SOG 19h, 2.11 Floating-Point Unit
294// Stores and floating point to general purpose register transfer
295// have 2 dedicated pipelines (pipe 5 and 6).
296defvar Zn3FPLd01 = Zn3FPP45;
297
298// AMD SOG 19h, 2.11 Floating-Point Unit
299// Note that FP stores are supported on two pipelines,
300// but throughput is limited to one per cycle.
// Zn3FPSt is declared with Zn3FPP45 as its Super resource but with a single
// unit, which is how the one-store-per-cycle limit described above is
// expressed on top of the two-unit Zn3FPP45.
301let Super = Zn3FPP45 in
302def Zn3FPSt : ProcResource<1>;
303
304// Integer Adds, Subtracts, and Compares
305// Some complex VADD operations are not available in all pipes.
306def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3]>;
307
308def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
309def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;
310
311// Integer Multiplies, SAD, Blendvb
312def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;
313
314// Data Shuffles, Packs, Unpacks, Permute
315// Some complex shuffle operations are only available in pipe1.
316def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;
317
318// Bit Shift Left/Right operations
319def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;
320
321// Moves and Logical operations on Packed Integer Data Types
322def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3]>;
323
324// *AES*
325def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;
326
327// *CLM*
328def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;
329
330
331//
332// Scheduling
333//===----------------------------------------------------------------------===//
334
335// Agner, 21.8 Register renaming and out-of-order schedulers
336// The floating point register file has 160 vector registers
337// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
338// anandtech also confirms this.
// Positional arguments: 160 physical registers, the register classes that
// draw from the file (VR64, VR128, VR256), then two lists that line up with
// the class list element by element.
// NOTE(review): the two parallel lists are presumably the per-class rename
// cost and the per-class move-elimination flag (VR64 excluded, VR128/VR256
// included) -- confirm against the RegisterFile class declaration.
339def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
340                            6,  // Max moves that can be eliminated per cycle.
341                            0>; // Restrict move elimination to zero regs.
342
343// AMD SOG 19h, 2.11 Floating-Point Unit
344// The floating-point scheduler has a 2*32 entry macro op capacity.
345// AMD SOG 19h, 2.11 Floating-Point Unit
346// <...> the scheduler can issue 1 micro op per cycle for each pipe.
347// FIXME: those are two separate schedulers, not a single big one.
// One group covering every FP pipe resource. The commented-out names show
// where separate Zn3FPP4 / Zn3FPP5 entries would sit; the two-unit Zn3FPP45
// resource is listed once instead, in the scheduler-1 row.
348def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
349                          Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
350                         ]> {
  // 2 schedulers * 32 entries = 64, modeled as one shared buffer (see the
  // FIXME above).
351  let BufferSize = !mul(2, 32);
352}
353
354// AMD SOG 19h, 2.11 Floating-Point Unit
355// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
356// even if floating-point scheduler is full.
357// FIXME: how to model this properly?
358
359
360//===----------------------------------------------------------------------===//
361// Load-Store Unit
362//
363
364// AMD SOG 19h, 2.12 Load-Store Unit
365// The LS unit contains three largely independent pipe-lines
366// enabling the execution of three 256-bit memory operations per cycle.
// Zn3LSU is the parent resource with 3 units. Zn3Load (3 units) and Zn3Store
// (2 units) are both declared with Zn3LSU as their Super resource, and each
// is wrapped in a LoadQueue / StoreQueue record below.
367def Zn3LSU : ProcResource<3>;
368
369// AMD SOG 19h, 2.12 Load-Store Unit
370// All three memory operations can be loads.
371let Super = Zn3LSU in
372def Zn3Load : ProcResource<3> {
373  // AMD SOG 19h, 2.12 Load-Store Unit
374  // The LS unit can process up to 72 out-of-order loads.
375  let BufferSize = 72;
376}
377
// Associates the load queue with the Zn3Load resource defined just above.
378def Zn3LoadQueue : LoadQueue<Zn3Load>;
379
380// AMD SOG 19h, 2.12 Load-Store Unit
381// A maximum of two of the memory operations can be stores.
382let Super = Zn3LSU in
383def Zn3Store : ProcResource<2> {
384  // AMD SOG 19h, 2.12 Load-Store Unit
385  // The LS unit utilizes a 64-entry store queue (STQ).
386  let BufferSize = 64;
387}
388
// Associates the store queue with the Zn3Store resource defined just above.
389def Zn3StoreQueue : StoreQueue<Zn3Store>;
390
391//===----------------------------------------------------------------------===//
392// Basic helper classes.
393//===----------------------------------------------------------------------===//
394
395// Many SchedWrites are defined in pairs with and without a folded load.
396// Instructions with folded loads are usually micro-fused, so they only appear
397// as two micro-ops when dispatched by the schedulers.
398// This multiclass defines the resource usage for variants with and without
399// folded loads.
400
// Lowest-level helper: emits one anonymous WriteRes record for SchedRW.
//   SchedRW  - the SchedWrite being described.
//   ExePorts - resources passed as WriteRes' resource list.
//   Lat      - assigned to Latency (default 1).
//   Res      - assigned to ReleaseAtCycles (default: empty list).
//   UOps     - assigned to NumMicroOps (default 1).
401multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
402                         int Lat = 1, list<int> Res = [], int UOps = 1> {
403  def : WriteRes<SchedRW, ExePorts> {
404    let Latency = Lat;
405    let ReleaseAtCycles = Res;
406    let NumMicroOps = UOps;
407  }
408}
409
// Emits two WriteRes records: one for SchedRW itself and one for
// SchedRW.Folded (the variant with a folded load).
//
// The first record forwards ExePorts, Lat, Res and UOps unchanged.
//
// The folded record is derived as follows:
//   resources - [AGU, Zn3Load] prepended to ExePorts;
//   latency   - Lat + LoadLat;
//   micro-ops - UOps + LoadUOps;
//   ReleaseAtCycles -
//     * the empty list, when Res is empty and LoadRes is 1;
//     * otherwise [1, LoadRes] (the entries for AGU and Zn3Load) followed by
//       Res, or, when Res is empty, by a 1 for every entry of ExePorts.
//       This keeps the list the same length as the resource list.
410multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
411                             list<ProcResourceKind> ExePorts, int Lat,
412                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
413                             ProcResourceKind AGU, int LoadRes> {
414  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
415
416  defm : __zn3WriteRes<SchedRW.Folded,
417                       !listconcat([AGU, Zn3Load], ExePorts),
418                       !add(Lat, LoadLat),
419                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
420                         [],
421                         !listconcat([1, LoadRes],
422                           !if(!empty(Res),
423                             !listsplat(1, !size(ExePorts)),
424                             Res))),
425                       !add(UOps, LoadUOps)>;
426}
427
428// For classes without folded loads.
// The Int / XMM / YMM variants below have identical parameter lists and
// bodies: each forwards all five arguments straight to __zn3WriteRes. Only
// the name differs.
429multiclass Zn3WriteResInt<SchedWrite SchedRW,
430                          list<ProcResourceKind> ExePorts, int Lat = 1,
431                          list<int> Res = [], int UOps = 1> {
432  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
433}
434
435multiclass Zn3WriteResXMM<SchedWrite SchedRW,
436                          list<ProcResourceKind> ExePorts, int Lat = 1,
437                          list<int> Res = [], int UOps = 1> {
438  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
439}
440
441multiclass Zn3WriteResYMM<SchedWrite SchedRW,
442                          list<ProcResourceKind> ExePorts, int Lat = 1,
443                          list<int> Res = [], int UOps = 1> {
444  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
445}
446
447// For classes with folded loads.
// Each wrapper below fills in the load-side arguments of __zn3WriteResPair:
//   IntPair         - load latency Znver3Model.LoadLatency, address resource
//                     Zn3AGU012.
//   XMMPair/YMMPair - load latency Znver3Model.VecLoadLatency, address
//                     resource Zn3FPLd01; these two are otherwise identical.
// LoadUOps defaults to 0 and LoadRes to 1 in all three.
448multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
449                              list<ProcResourceKind> ExePorts, int Lat = 1,
450                              list<int> Res = [], int UOps = 1,
451                              int LoadUOps = 0, int LoadRes = 1> {
452  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
453                           Znver3Model.LoadLatency,
454                           LoadUOps, Zn3AGU012, LoadRes>;
455}
456
457multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
458                              list<ProcResourceKind> ExePorts, int Lat = 1,
459                              list<int> Res = [], int UOps = 1,
460                              int LoadUOps = 0, int LoadRes = 1> {
461  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
462                           Znver3Model.VecLoadLatency,
463                           LoadUOps, Zn3FPLd01, LoadRes>;
464}
465
466multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
467                              list<ProcResourceKind> ExePorts, int Lat = 1,
468                              list<int> Res = [], int UOps = 1,
469                              int LoadUOps = 0, int LoadRes = 1> {
470  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
471                           Znver3Model.VecLoadLatency,
472                           LoadUOps, Zn3FPLd01, LoadRes>;
473}
474
475
476//===----------------------------------------------------------------------===//
477// Here be dragons.
478//===----------------------------------------------------------------------===//
479
480def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;
481
482def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
483def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
484def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;
485
486// AMD SOG 19h, 2.11 Floating-Point Unit
487// There is 1 cycle of added latency for a result to cross
488// from F to I or I to F domain.
489def : ReadAdvance<ReadInt2Fpu, -1>;
490
491// Instructions with both a load and a store folded are modeled as a folded
492// load + WriteRMW.
493defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 1], 0>;
494
495// Loads, stores, and moves, not folded with other operations.
496defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
497
498// Model the effect of clobbering the read-write mask operand of the GATHER operation.
499// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
500defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;
501
// Override for the 8/16-bit loads and 16-bit-destination extending loads
// listed in the InstRW below. Uses the same resources and latency expression
// as WriteLoad (LoadLatency + 1), but the ReleaseAtCycles entry for Zn3AGU012
// is 3 instead of 1.
502def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
503  let Latency = !add(Znver3Model.LoadLatency, 1);
504  let ReleaseAtCycles = [3, 1];
505  let NumMicroOps = 1;
506}
507def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
508
509defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
510defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
511defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;
512
513// Treat misc copies as a move.
514def : InstRW<[WriteMove], (instrs COPY)>;
515
516def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
517  let Latency = Znver3Model.LoadLatency;
518  let ReleaseAtCycles = [1, 1, 4];
519  let NumMicroOps = 1;
520}
521def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;
522
523def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
524  let Latency = Znver3Model.StoreLatency;
525  let ReleaseAtCycles = [4, 1, 1];
526  let NumMicroOps = 2;
527}
528def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
529
530// Arithmetic.
531defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.
532
533def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
534  let Latency = 1;
535  let ReleaseAtCycles = [4];
536  let NumMicroOps = 1;
537}
538def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
539                                        AND8i8, AND16i16, AND32i32, AND64i32,
540                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
541                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
542                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
543
544def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
545  let Latency = 1;
546  let ReleaseAtCycles = [4];
547  let NumMicroOps = 1;
548}
549def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
550
551def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
552  let Latency = 1;
553  let ReleaseAtCycles = [2];
554  let NumMicroOps = 1;
555}
556def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
557
558def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
559  let Latency = 3;
560  let ReleaseAtCycles = [1];
561  let NumMicroOps = 1;
562}
563def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
564                                          PEXT32rr, PEXT64rr)>;
565
566defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.
567
568def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
569  let Latency = 1;
570  let ReleaseAtCycles = [1, 1, 7, 1];
571  let NumMicroOps = 1;
572}
573def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
574
575// This is for simple LEAs with one or two input operands.
576defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>;     // LEA instructions can't fold loads.
577
578// This write is used for slow LEA instructions.
579def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
580  let Latency = 2;
581  let ReleaseAtCycles = [1];
582  let NumMicroOps = 2;
583}
584
585// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
586// or an LEA with a `Scale` value different than 1.
// Predicate that is true when either alternative inside CheckAny holds:
// IsThreeOperandsLEAFn, or operand 2 is an immediate whose value is not 1.
587def Zn3SlowLEAPredicate : MCSchedPredicate<
588  CheckAny<[
589    // A 3-operand LEA (base, index, offset).
590    IsThreeOperandsLEAFn,
591    // An LEA with a "Scale" different than 1.
592    CheckAll<[
      // Both checks look at operand index 2.
593      CheckIsImmOperand<2>,
594      CheckNot<CheckImmOperand<2, 1>>
595    ]>
596  ]>
597>;
598
// Variant write for LEA: resolves to Zn3Write3OpsLEA when Zn3SlowLEAPredicate
// matches, and to the plain WriteLEA otherwise (NoSchedPred is the fallback
// entry). The InstRW below attaches it to the 32/64-bit LEA forms only;
// LEA16r is handled by a separate record.
599def Zn3WriteLEA : SchedWriteVariant<[
600    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
601    SchedVar<NoSchedPred,         [WriteLEA]>
602]>;
603
604def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
605
606def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
607  let Latency = 2; // FIXME: not from llvm-exegesis
608  let ReleaseAtCycles = [4];
609  let NumMicroOps = 2;
610}
611
612def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
613
614// Integer multiplication
// Argument order on each Zn3WriteResIntPair line: write class, resources,
// latency, ReleaseAtCycles, micro-op count, then the optional LoadUOps.
// All multiplies are placed on Zn3Multiplier, which is an alias of Zn3ALU1.
615defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
616defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
617defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
618defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
619defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
620defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
621defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
622defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
623defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
624defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>;    // Integer 64-bit Unsigned Multiply Without Affecting Flags.
625defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
626defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
627defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>;  // Integer multiplication, high part; latency adds LoadLatency. No resources, 0 uops.
628defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;  // Integer multiplication, high part. No resources, 0 uops.
629
630defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
631defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
632
633defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
634
635def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
636  let Latency = 3;
637  let ReleaseAtCycles = [12];
638  let NumMicroOps = 3;
639}
640def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
641
642defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>;     // Compare and set, compare and swap.
643
// Memory form of the 8-bit CMPXCHG, derived from the register form
// Zn3WriteCMPXCHG8rr: latency is LoadLatency + the rr latency (4 + 3 = 7),
// micro-ops are the rr count + 2 (3 + 2 = 5), and the ALU entry of
// ReleaseAtCycles keeps the rr value of 12.
644def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
645  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
646  let ReleaseAtCycles = [1, 1, 12];
647  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
648}
649def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
650
651def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
652  let Latency = 3; // FIXME: not from llvm-exegesis
653  let ReleaseAtCycles = [24];
654  let NumMicroOps = 19;
655}
656def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
657
658def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
659  let Latency = 4; // FIXME: not from llvm-exegesis
660  let ReleaseAtCycles = [59];
661  let NumMicroOps = 28;
662}
663def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
664
665def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
666  let Latency = 1;
667  let ReleaseAtCycles = [2];
668  let NumMicroOps = 2;
669}
670def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
671
672def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
673  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
674  let ReleaseAtCycles = [1, 1, 2];
675  let NumMicroOps = 5;
676}
677def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
678
679def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
680  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
681  let ReleaseAtCycles = [1, 1, 2];
682  let NumMicroOps = 2;
683}
684def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
685
686// Integer division.
687// FIXME: uops for 8-bit division measure as 2; for the others it's a guess.
688// FIXME: latency for 8-bit division measures as 10; for the others it's a guess.
689defm : Zn3WriteResIntPair<WriteDiv8, [Zn3Divider], 10, [10], 2>;
690defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
691defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
692defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;
693defm : Zn3WriteResIntPair<WriteIDiv8, [Zn3Divider], 10, [10], 2>;
694defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
695defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
696defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
697
698defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
699defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.
700
701defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.
702
703def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
704  let Latency = 1;
705  let ReleaseAtCycles = [4];
706  let NumMicroOps = 1;
707}
708def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;
709
710defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.
711
712def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
713  let Latency = 1;
714  let ReleaseAtCycles = [4];
715  let NumMicroOps = 1;
716}
717def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;
718
719defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.
720
721def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
722  let Latency = 2;
723  let ReleaseAtCycles = [4];
724  let NumMicroOps = 2;
725}
726def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
727
728defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
729defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
730defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
731defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
732defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.
733
734defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
735defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
736defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;
737
738defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
739defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
740defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
741
742// Integer shifts and rotates.
743defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
744defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
745defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
746
747def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> { // Write for the RCL/RCR `r1` register forms listed below.
748  let Latency = 1; // Result latency, in cycles.
749  let ReleaseAtCycles = [2]; // Zn3ALU12 is modeled as occupied for 2 cycles.
750  let NumMicroOps = 1;
751}
752def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
753                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
754
755def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> { // Write for the RCL/RCR `m1` forms; derived from Zn3WriteRotateR1.
756  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency); // Model load latency + register-form latency.
757  let ReleaseAtCycles = [1, 1, 2]; // Positional per resource: AGU 1, load 1, ALU 2.
758  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1); // One uop more than the register form.
759}
760def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
761                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
762
763def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> { // Write for the RCR `ri` register forms listed below.
764  let Latency = 3; // Result latency, in cycles.
765  let ReleaseAtCycles = [6]; // Zn3ALU12 is modeled as occupied for 6 cycles.
766  let NumMicroOps = 7;
767}
768def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
769
770def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> { // Write for the RCR `mi` forms; derived from Zn3WriteRotateRightRI.
771  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency); // Model load latency + register-form latency.
772  let ReleaseAtCycles = [1, 1, 8]; // Positional per resource: AGU 1, load 1, ALU 8 (the register form uses 6).
773  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3); // NOTE(review): +3 here, while the RCL `mi` and RCR/RCL `mCL` writes add 2 - confirm intentional.
774}
775def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
776
777def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> { // Write for the RCL `ri` register forms listed below.
778  let Latency = 4; // Result latency, in cycles.
779  let ReleaseAtCycles = [8]; // Zn3ALU12 is modeled as occupied for 8 cycles.
780  let NumMicroOps = 9;
781}
782def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
783
784def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> { // Write for the RCL `mi` forms; derived from Zn3WriteRotateLeftRI.
785  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency); // Model load latency + register-form latency.
786  let ReleaseAtCycles = [1, 1, 8]; // Positional per resource: AGU 1, load 1, ALU 8.
787  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2); // Two uops more than the register form.
788}
789def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
790
791defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
792
793def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> { // Write for the RCR `rCL` register forms listed below.
794  let Latency = 3; // Result latency, in cycles.
795  let ReleaseAtCycles = [6]; // Zn3ALU12 is modeled as occupied for 6 cycles.
796  let NumMicroOps = 7;
797}
798def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
799
800def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> { // Write for the RCR `mCL` forms; derived from Zn3WriteRotateRightRCL.
801  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency); // Model load latency + register-form latency.
802  let ReleaseAtCycles = [1, 1, 8]; // Positional per resource: AGU 1, load 1, ALU 8 (the register form uses 6).
803  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2); // Two uops more than the register form.
804}
805def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
806
807def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> { // Write for the RCL `rCL` register forms listed below.
808  let Latency = 4; // Result latency, in cycles.
809  let ReleaseAtCycles = [8]; // Zn3ALU12 is modeled as occupied for 8 cycles.
810  let NumMicroOps = 9;
811}
812def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
813
814def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> { // Write for the RCL `mCL` forms; derived from Zn3WriteRotateLeftRCL.
815  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency); // Model load latency + register-form latency.
816  let ReleaseAtCycles = [1, 1, 8]; // Positional per resource: AGU 1, load 1, ALU 8.
817  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2); // Two uops more than the register form.
818}
819def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
820
821// Double shift instructions.
822defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
823defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
824defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
825defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
826
827// BMI1 BEXTR/BLS, BMI2 BZHI
828defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
829defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
830defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
831
832// Idioms that clear a register, like xorps %xmm0, %xmm0.
833// These can often bypass execution ports completely.
834defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
835
836// Branches don't produce values, so they have no latency, but they still
837// consume resources. Indirect branches can fold loads.
838defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
839
840// Floating point. This covers both scalar and vector operations.
841defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
842defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
843defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
844defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
845defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
846defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
847defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
848defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
849defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
850
851def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> { // Write for the (V)MOVHPD/(V)MOVHPS `mr` forms below. NOTE(review): name says "MMX" and repeats "Write" - confirm naming.
852  let Latency = 2; // FIXME: not from llvm-exegesis
853  let ReleaseAtCycles = [1, 1]; // Positional per resource: FP store 1, store 1.
854  let NumMicroOps = 2;
855}
856def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr,  MOVHPSmr,
857                                               VMOVHPDmr, VMOVHPSmr)>;
858
859defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
860defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
861defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
862defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
863defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
864
865defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
866defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
867defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
868defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
869
870defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point add/sub.
871
872def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { // Write for the x87 ADD/SUB/SUBR/MUL `_FI16m`/`_FI32m` forms listed below.
873  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
874  let ReleaseAtCycles = [1, 1, 24]; // Positional per resource: AGU 1, load 1, FPU 24.
875  let NumMicroOps = 2;
876}
877def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
878                                         SUB_FI16m, SUB_FI32m,
879                                         SUBR_FI16m, SUBR_FI32m,
880                                         MUL_FI16m, MUL_FI32m)>;
881
882def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { // Write for the x87 DIV/DIVR `_FI16m`/`_FI32m` forms listed below.
883  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
884  let ReleaseAtCycles = [1, 1, 62]; // Positional per resource: AGU 1, load 1, FPU 62.
885  let NumMicroOps = 2;
886}
887def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
888                                       DIVR_FI16m, DIVR_FI32m)>;
889
890defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
891defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
892defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
893defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
894defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
895defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
896defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
897defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>;  // Floating point compare.
898defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
899defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
900defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
901defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>;  // Floating point double compare.
902defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
903defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
904defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
905defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
906defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
907defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>;  // Floating point multiplication.
908defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
909defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
910defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
911defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
912defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
913defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
914defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
915defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>;  // Floating point division.
916defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
917defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
918defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
919defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>;  // Floating point double division.
920defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
921defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
922defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
923defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>;   // Floating point square root.
924defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
925defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
926defm : X86WriteResPairUnsupported<WriteFSqrtZ>;  // Floating point square root (ZMM).
927defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>;  // Floating point double square root.
928defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
929defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
930defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
931defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
932defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>;  // Floating point reciprocal estimate.
933defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
934defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
935defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
936defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>;  // Floating point reciprocal square root estimate.
937defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
938defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
939defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
940defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [1], 1>;  // Fused Multiply Add.
941defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (XMM).
942defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [1], 1>; // Fused Multiply Add (YMM).
943defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
944defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
945defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
946defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
947defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
948defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
949defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
950defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
951defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
952defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
953defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
954defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
955defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
956defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
957defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
958defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
959defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
960defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
961defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
962defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
963defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
964defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
965defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
966defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
967defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
968defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).
969
970// Horizontal Add/Sub (float and integer)
971defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
972defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
973defm : X86WriteResPairUnsupported<WriteFHAddZ>;
974defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
975defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
976defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
977defm : X86WriteResPairUnsupported<WritePHAddZ>;
978
979// Vector integer operations.
980defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
981defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
982defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
983defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
984defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
985defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
986defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
987defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
988defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
989
990def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> { // Write for VEXTRACTF128rr/VEXTRACTI128rr; also the base for the two writes that follow.
991  let Latency = 4; // Result latency, in cycles.
992  let ReleaseAtCycles = [1]; // Zn3FPFMisc0 is modeled as occupied for 1 cycle.
993  let NumMicroOps = 1;
994}
995def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
996
997def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> { // Write for VEXTRACTI128mr/VEXTRACTF128mr; derived from the rr write.
998  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); // NOTE(review): resources are store units, yet LoadLatency (not StoreLatency) is added - confirm.
999  let ReleaseAtCycles = [1, 1, 1]; // Positional per resource: FP misc 1, FP store 1, store 1.
1000  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1); // One uop more than the rr form.
1001}
1002def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1003
1004def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> { // Write for VINSERTF128rm; reuses the VEXTRACT rr write's latency and uop count.
1005  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); // Model load latency + VEXTRACT rr latency.
1006  let ReleaseAtCycles = [1, 1, 1]; // Positional per resource: AGU 1, load 1, FP misc 1.
1007  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0); // Same uop count as the VEXTRACT rr write.
1008}
1009def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>; // NOTE(review): only VINSERTF128rm is bound here - confirm VINSERTI128rm is handled elsewhere.
1010
1011defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1012defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1013defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1014defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1015defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
1016defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
1017defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1018
1019defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
1020defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
1021
1022def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> { // Write for MMX_MOVQ2FR64rr and MMX_MOVQ2DQrr.
1023  let Latency = 1; // Result latency, in cycles.
1024  let ReleaseAtCycles = [1, 2]; // Positional per resource: Zn3FPLd01 1, Zn3FPFMisc0123 2.
1025  let NumMicroOps = 2;
1026}
1027def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1028
1029def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> { // Write for MMX_MOVD64rr and MMX_MOVD64to64rr; differs from Zn3WriteMOVMMX only in ReleaseAtCycles.
1030  let Latency = 1; // Result latency, in cycles.
1031  let ReleaseAtCycles = [1, 4]; // Positional per resource: Zn3FPLd01 1, Zn3FPFMisc0123 4 (Zn3WriteMOVMMX uses 2).
1032  let NumMicroOps = 2;
1033}
1034def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1035
1036defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.
1037
1038def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> { // Write for EXTRQ and INSERTQ.
1039  let Latency = 3; // Result latency, in cycles.
1040  let ReleaseAtCycles = [1, 1]; // Positional per resource: Zn3FPVShuf01 1, Zn3FPLd01 1.
1041  let NumMicroOps = 1;
1042}
1043def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1044
1045def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> { // Write for EXTRQI and INSERTQI; same as Zn3WriteEXTRQ_INSERTQ except for the uop count.
1046  let Latency = 3; // Result latency, in cycles.
1047  let ReleaseAtCycles = [1, 1]; // Positional per resource: Zn3FPVShuf01 1, Zn3FPLd01 1.
1048  let NumMicroOps = 2; // Zn3WriteEXTRQ_INSERTQ uses 1.
1049}
1050def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1051
1052defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1053
1054def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> { // Write for the listed XMM `rr` ops; uses Zn3FPVAdd01, whereas the WriteVecALUX defm lists Zn3FPVAdd0123.
1055  let Latency = 1; // Result latency, in cycles.
1056  let ReleaseAtCycles = [1]; // Zn3FPVAdd01 is modeled as occupied for 1 cycle.
1057  let NumMicroOps = 1;
1058}
1059def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1060                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1061                                            PAVGBrr, PAVGWrr,
1062                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
1063                                            VPABSBrr, VPABSDrr, VPABSWrr,
1064                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1065                                            VPAVGBrr, VPAVGWrr,
1066                                            VPCMPEQQrr,
1067                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1068                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1069
1070def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> { // Write for the listed MMX_* `rr` ops; same fields as Zn3WriteVecALUXSlow.
1071  let Latency = 1; // Result latency, in cycles.
1072  let ReleaseAtCycles = [1]; // Zn3FPVAdd01 is modeled as occupied for 1 cycle.
1073  let NumMicroOps = 1;
1074}
1075def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
1076                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
1077                                           MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr,
1078                                           MMX_PAVGBrr, MMX_PAVGWrr,
1079                                           MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>;
1080
1081defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1082
1083def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> { // Write for the listed YMM `Yrr` ops; uses Zn3FPVAdd01, whereas the WriteVecALUY defm lists Zn3FPVAdd0123.
1084  let Latency = 1; // Result latency, in cycles.
1085  let ReleaseAtCycles = [1]; // Zn3FPVAdd01 is modeled as occupied for 1 cycle.
1086  let NumMicroOps = 1;
1087}
1088def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1089                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1090                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1091                                            VPAVGBYrr, VPAVGWYrr,
1092                                            VPCMPEQQYrr,
1093                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1094
1095defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
1096defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
1097defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1098defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1099defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
1100defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1101defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
1102defm : X86WriteResPairUnsupported<WriteVecTestZ>;  // Vector integer TEST instructions (ZMM).
1103defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
1104defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
1105defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1106defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
1107defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
1108defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1109defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1110defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
1111defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
1112defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1113defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1114defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
1115defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
1116defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1117defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
1118defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector shuffles.
1119defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1120defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1121defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
1122defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
1123defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1124defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1125defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
1126defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
1127defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1128defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
1129defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
1130defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1131defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
1132defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
1133defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1134defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1135defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
1136defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1137defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1138defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
1139defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1140
1141// Vector insert/extract operations.
1142defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1143defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1144defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1145
1146// MOVMSK operations.
1147defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1148defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1149defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
1150defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1151
1152// Conversion between integer and float.
1153defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>;  // Double -> Integer.
1154defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
1155defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
1156defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
1157
1158def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> { // Write for the MMX_CVT(T)PD2PI forms listed below.
1159  let Latency = 1; // Result latency, in cycles.
1160  let ReleaseAtCycles = [2]; // Zn3FPFCvt01 is modeled as occupied for 2 cycles.
1161  let NumMicroOps = 2;
1162}
1163def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>; // NOTE(review): the `rm` forms share this write, which lists no AGU/load resource or load latency - confirm.
1164
1165defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>;  // Float -> Integer.
1166
1167defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1168defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
1169defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
1170
1171defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
1172defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1173defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1174defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
1175
1176def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> { // Write for MMX_CVTPI2PDrm and MMX_CVTPI2PDrr.
1177  let Latency = 2; // Result latency, in cycles.
1178  let ReleaseAtCycles = [6]; // Zn3FPFCvt01 is modeled as occupied for 6 cycles.
1179  let NumMicroOps = 2;
1180}
1181def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>; // NOTE(review): the `rm` form shares this write, which lists no AGU/load resource or load latency - confirm.
1182
1183defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
1184defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1185defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1186defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
1187
1188def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> { // Write for MMX_CVTPI2PSrr.
1189  let Latency = 3; // Result latency, in cycles.
1190  let ReleaseAtCycles = [1]; // Zn3FPFCvt01 is modeled as occupied for 1 cycle.
1191  let NumMicroOps = 2;
1192}
1193def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>; // Only the register form is bound here.
1194
// Float -> Double size conversion.
defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>;
// Float -> Double size conversion (XMM).
defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>;
// Float -> Double size conversion (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
// Float -> Double size conversion (ZMM): declared unsupported.
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>;
// Double -> Float size conversion (XMM).
defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>;
// Double -> Float size conversion (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>;
// Double -> Float size conversion (ZMM): declared unsupported.
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
1204
// Half -> Float size conversion.
defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>;
// Half -> Float size conversion (YMM).
defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2,
                          /*LoadUOps=*/-1>;
// Half -> Float size conversion (ZMM): declared unsupported.
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

// Float -> Half size conversion.
defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>;
// Float -> Half size conversion (YMM).
defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>;
// Float -> Half size conversion (ZMM): declared unsupported.
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
// Float -> Half + store size conversion. The latency is the register-form
// latency (3) plus the model's StoreLatency.
defm : Zn3WriteResXMM<WriteCvtPS2PHSt,
                      [Zn3FPFCvt01, Zn3FPSt, Zn3Store],
                      !add(3, Znver3Model.StoreLatency),
                      [1, 1, 1], 2>;
// Float -> Half + store size conversion (YMM). The latency is the
// register-form latency (6) plus the model's StoreLatency.
defm : Zn3WriteResYMM<WriteCvtPS2PHYSt,
                      [Zn3FPFCvt01, Zn3FPSt, Zn3Store],
                      !add(6, Znver3Model.StoreLatency),
                      [2, 1, 1], 3>;
// Float -> Half + store size conversion (ZMM): declared unsupported.
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
1215
// CRC32 instruction: bound to the Zn3ALU1 resource.
defm : Zn3WriteResIntPair<WriteCRC32,
                          [Zn3ALU1], 3, [1], 1>;
1218
// SHA1MSG1, register form.
def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn3WriteSHA1MSG1rr],
             (instrs SHA1MSG1rr)>;

// SHA1MSG1, memory form: latency and micro-op count are derived from the
// register form (load latency added, zero extra micro-ops).
def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteSHA1MSG1rm],
             (instrs SHA1MSG1rm)>;
1232
// SHA1MSG2 and SHA1NEXTE, register forms (shared write).
def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr],
             (instrs SHA1MSG2rr,
                     SHA1NEXTErr)>;

// SHA1MSG2 and SHA1NEXTE, memory forms: latency and micro-op count are
// derived from the register-form write (load latency added, zero extra
// micro-ops).
def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm],
             (instrs SHA1MSG2rm,
                     SHA1NEXTErm)>;
1246
// SHA256MSG1, register form.
def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ReleaseAtCycles = [3];
}
def : InstRW<[Zn3WriteSHA256MSG1rr],
             (instrs SHA256MSG1rr)>;

// SHA256MSG1, memory form: latency and micro-op count are derived from the
// register form (load latency added, zero extra micro-ops).
def Zn3Writerm_SHA256MSG1rm
    : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
  let ReleaseAtCycles = [1, 1, 3];
}
def : InstRW<[Zn3Writerm_SHA256MSG1rm],
             (instrs SHA256MSG1rm)>;
1260
// SHA256MSG2, register form.
def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 4;
  let Latency = 3;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn3WriteSHA256MSG2rr],
             (instrs SHA256MSG2rr)>;

// SHA256MSG2, memory form: load latency is added to the register-form
// latency, and one extra micro-op is added to the register-form count.
def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn3WriteSHA256MSG2rm],
             (instrs SHA256MSG2rm)>;
1274
// SHA1RNDS4, register/immediate form.
def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 6;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn3WriteSHA1RNDS4rri],
             (instrs SHA1RNDS4rri)>;

// SHA256RNDS2, register form.
def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn3WriteSHA256RNDS2rr],
             (instrs SHA256RNDS2rr)>;
1288
// String instructions. All four classes are bound to Zn3FPVAdd0123.

// Packed Compare Implicit Length Strings, Return Mask.
defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3,
                          /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask.
defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7,
                          /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index.
defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index.
defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8,
                          /*LoadUOps=*/4>;
1298
// AES instructions; all three classes share the same Zn3FPAES01 numbers.
// Decryption, encryption.
defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>;
// InvMixColumn.
defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>;
// Key Generation.
defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>;

// Carry-less multiplication instructions.
defm : Zn3WriteResXMMPair<WriteCLMul,
                          [Zn3FPCLM01], 4, [4], 4>;
1306
// EMMS/FEMMS.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>;

// Load/store MXCSR.
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteLDMXCSR,
                      [Zn3AGU012, Zn3Load, Zn3ALU0123],
                      !add(Znver3Model.LoadLatency, 1),
                      [1, 1, 6], 1>;
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteSTMXCSR,
                      [Zn3ALU0123, Zn3AGU012, Zn3Store],
                      !add(1, Znver3Model.StoreLatency),
                      [60, 1, 1], 2>;

// Catch-all for expensive system instructions.
defm : Zn3WriteResInt<WriteSystem,
                      [Zn3ALU0123], 100, [100], 100>;
1316
// VZEROUPPER: modelled with zero latency and a single micro-op.
def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 1;
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn3WriteVZEROUPPER],
             (instrs VZEROUPPER)>;

// VZEROALL: modelled as 18 micro-ops.
def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
  let NumMicroOps = 18;
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
}
def : InstRW<[Zn3WriteVZEROALL],
             (instrs VZEROALL)>;
1330
// AVX2.
// Fp 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1,
                          /*LoadUOps=*/2>;
// Fp 256-bit width variable shuffles.
defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2,
                          /*LoadUOps=*/1>;
// 256-bit width vector shuffles.
defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>;
1335
// VPERM2I128 / VPERM2F128, register forms (shared write).
def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr],
             (instrs VPERM2I128rr,
                     VPERM2F128rr)>;

// VPERM2F128, memory form: latency and micro-op count are derived from the
// register-form write (load latency added, zero extra micro-ops).
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
  let Latency = !add(Znver3Model.LoadLatency,
                     Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
}
def : InstRW<[Zn3WriteVPERM2F128rm],
             (instrs VPERM2F128rm)>;
1349
// VPERMPS (YMM), memory form: load latency plus 7 cycles.
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = 3;
  let Latency = !add(Znver3Model.LoadLatency, 7);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteVPERMPSYrm],
             (instrs VPERMPSYrm)>;
1356
// VPERMPD / VPERMQ (YMM), register/immediate forms (shared write).
def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let Latency = 6;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn3WriteVPERMYri],
             (instrs VPERMPDYri,
                     VPERMQYri)>;

// VPERMPD (YMM), memory/immediate form: load latency is added to the
// register-form latency, and one extra micro-op to the register-form count.
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteVPERMPDYmi],
             (instrs VPERMPDYmi)>;
1370
// Memory forms of VPERMQ (YMM, immediate) and VPERMD (YMM): load latency
// plus 5 cycles.
def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
  let NumMicroOps = 2;
  let Latency = !add(Znver3Model.LoadLatency, 5);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteVPERMDYm],
             (instrs VPERMQYmi,
                     VPERMDYrm)>;
1377
// 256-bit width packed vector width-changing move.
defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2,
                          /*LoadUOps=*/-1>;
// 256-bit width vector variable shuffles.
defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf], 5, [1], 2,
                          /*LoadUOps=*/1>;
// Variable vector shifts.
defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (YMM).
defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>;
// Variable vector shifts (ZMM): declared unsupported in this model.
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
1383
// Old microcoded instructions that nobody uses.
defm : Zn3WriteResInt<WriteMicrocoded,
                      [Zn3ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn3WriteResInt<WriteFence,
                      [Zn3ALU0123], 1, [100], 1>;
1389
// LFENCE: bound to the Zn3LSU resource with a 30-cycle release.
def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [30];
}
def : InstRW<[Zn3WriteLFENCE],
             (instrs LFENCE)>;

// SFENCE: bound to the Zn3LSU resource with a 1-cycle release.
def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn3WriteSFENCE],
             (instrs SFENCE)>;
1403
// Nop, not very useful except it provides a model for nops!
// FIXME: latency not from llvm-exegesis
defm : Zn3WriteResInt<WriteNop,
                      [Zn3ALU0123], 0, [1], 1>;
1406
1407
///////////////////////////////////////////////////////////////////////////////
// Zero Cycle Move
///////////////////////////////////////////////////////////////////////////////

// A write that names no processor resources and has zero latency; it still
// counts as one micro-op. Bound here to the 32/64-bit GPR register moves and
// MOVSX32rr32.
def Zn3WriteZeroLatency : SchedWriteRes<[]> {
  let NumMicroOps = 1;
  let Latency = 0;
  let ReleaseAtCycles = [];
}
def : InstRW<[Zn3WriteZeroLatency],
             (instrs MOV32rr, MOV32rr_REV,
                     MOV64rr, MOV64rr_REV,
                     MOVSX32rr32)>;

// Same shape as Zn3WriteZeroLatency (no resources, zero latency) but counted
// as two micro-ops; bound to the 32/64-bit register XCHG forms.
def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
  let NumMicroOps = 2;
  let Latency = 0;
  let ReleaseAtCycles = [];
}
def : InstRW<[Zn3WriteSwapRenameable],
             (instrs XCHG32rr, XCHG32ar,
                     XCHG64rr, XCHG64ar)>;
1428
// Compare+Exchange - TODO RMW support.
defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;

// Empty sched class.
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>;
// XMM/YMM fp moves: no resources, zero latency, one micro-op.
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
// ZMM fp moves: declared unsupported in this model.
defm : X86WriteResUnsupported<WriteFMoveZ>;

// MMX.
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>;
// XMM/YMM vector moves: no resources, zero latency, one micro-op.
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
// ZMM vector moves: declared unsupported in this model.
defm : X86WriteResUnsupported<WriteVecMoveZ>;
1440
// Register moves that this model declares optimizable. Every opcode sits in
// one equivalence class guarded by TruePred, so no extra condition is
// attached at this level.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr,     MOV32rr_REV,
    MOV64rr,     MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr,    XCHG32ar,
    XCHG64rr,    XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr,    MOVAPSrr_REV,
    MOVUPSrr,    MOVUPSrr_REV,
    MOVAPDrr,    MOVAPDrr_REV,
    MOVUPDrr,    MOVUPDrr_REV,
    MOVDQArr,    MOVDQArr_REV,
    MOVDQUrr,    MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr,   VMOVAPSrr_REV,
    VMOVUPSrr,   VMOVUPSrr_REV,
    VMOVAPDrr,   VMOVAPDrr_REV,
    VMOVUPDrr,   VMOVUPDrr_REV,
    VMOVDQArr,   VMOVDQArr_REV,
    VMOVDQUrr,   VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr,  VMOVAPSYrr_REV,
    VMOVUPSYrr,  VMOVUPSYrr_REV,
    VMOVAPDYrr,  VMOVAPDYrr_REV,
    VMOVUPDYrr,  VMOVUPDYrr_REV,
    VMOVDQAYrr,  VMOVDQAYrr_REV,
    VMOVDQUYrr,  VMOVDQUYrr_REV,
  ], TruePred>
]>;
1478
///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// GPR XOR/SUB: when ZeroIdiomPredicate matches, use the zero-latency write;
// otherwise fall back to the regular WriteALU class.
def Zn3WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiom],
             (instrs XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV)>;
1491
// GPR CMP: when operands 0 and 1 are the same register, use the
// zero-latency write; otherwise fall back to the regular WriteALU class.
def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn3WriteZeroIdiomEFLAGS],
             (instrs CMP8rr,  CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV)>;
1500
// AVX XMM fp logic: zero-latency write when ZeroIdiomPredicate matches,
// WriteFLogic otherwise.
def Zn3WriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn3WriteFZeroIdiom],
             (instrs VXORPSrr, VXORPDrr,
                     VANDNPSrr, VANDNPDrr)>;

// AVX YMM fp logic: zero-latency write when ZeroIdiomPredicate matches,
// WriteFLogicY otherwise.
def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteFLogicY]>
]>;
def : InstRW<[Zn3WriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr,
                     VANDNPSYrr, VANDNPDYrr)>;
1515
// AVX XMM integer logic: zero-latency write when ZeroIdiomPredicate matches,
// WriteVecLogicX otherwise.
def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomLogicX],
             (instrs VPXORrr,
                     VPANDNrr)>;

// AVX YMM integer logic: zero-latency write when ZeroIdiomPredicate matches,
// WriteVecLogicY otherwise.
def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecLogicY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomLogicY],
             (instrs VPXORYrr,
                     VPANDNYrr)>;
1528
// AVX XMM integer subtract / compare-greater: zero-latency write when
// ZeroIdiomPredicate matches, WriteVecALUX otherwise.
def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn3WriteVZeroIdiomALUX],
             (instrs VPSUBBrr,   VPSUBWrr,   VPSUBDrr,   VPSUBQrr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// AVX YMM integer subtract / compare-greater: zero-latency write when
// ZeroIdiomPredicate matches, WriteVecALUY otherwise.
def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteVecALUY]>
]>;
def : InstRW<[Zn3WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr,   VPSUBWYrr,   VPSUBDYrr,   VPSUBQYrr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1546
// Opcodes reported as zero idioms, grouped by register class. Every group
// is guarded by ZeroIdiomPredicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr,   XORPDrr,
    ANDNPSrr,  ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr,   PSUBWrr,   PSUBDrr,   PSUBQrr,
    PSUBSBrr,  PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr,   VXORPDrr,
    VANDNPSrr,  VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr,   VPSUBWrr,   VPSUBDrr,   VPSUBQrr,
    VPSUBSBrr,  VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr,   VXORPDYrr,
    VANDNPSYrr,  VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr,   VPSUBWYrr,   VPSUBDYrr,   VPSUBQYrr,
    VPSUBSBYrr,  VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;
1599
// Opcodes reported as dependency-breaking. The GPR CMP group is guarded by
// CheckSameRegOperand<0, 1>; every other group uses ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV,
    SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;
1629
1630} // SchedModel
1631