xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td (revision 963f5dc7a30624e95d72fb7f87b8892651164e46)
1//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver3 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15//    http://www.agner.org/optimize/microarchitecture.pdf
16//  * AMD Zen 3 Ryzen Deep Dive Review
17//    https://www.anandtech.com/show/16214/
18//===----------------------------------------------------------------------===//
19
def Znver3Model : SchedMachineModel {
  // Dispatch width.
  // AMD SOG 19h, 2.9.6 Dispatch:
  //   the processor may dispatch up to 6 macro ops per cycle into the
  //   execution engine.
  let IssueWidth = 6;

  // Out-of-order window.
  // AMD SOG 19h, 2.10.3:
  //   the retire control unit (RCU) tracks the completion status of all
  //   outstanding operations (integer, load/store, and floating-point) and is
  //   the final arbiter for exception processing and recovery. It can receive
  //   up to 6 macro ops dispatched per cycle and track up to 256 macro ops
  //   in-flight in non-SMT mode, or 128 per thread in SMT mode.
  let MicroOpBufferSize = 256;

  // Loop buffer.
  // AMD SOG 19h, 2.9.1 Op Cache:
  //   the op cache is an associative cache with 64 sets and 8 ways; each
  //   set-way entry contains up to 8 macro ops, for a maximum capacity of
  //   4K ops.
  // Agner, 22.5 µop cache:
  //   the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic
  //        complexity; with large values here the compilation of certain
  //        loops ends up taking way too long, so 512 is used instead of the
  //        real capacity.
  // let LoopMicroOpBufferSize = 4096;
  let LoopMicroOpBufferSize = 512;

  // Integer load-to-use latency.
  // AMD SOG 19h, 2.6.2 L1 Data Cache:
  //   the L1 data cache has a 4- or 5-cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache:
  //   the AGU and LS pipelines are optimized for simple address generation
  //   modes <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;

  // FP/vector load-to-use latency. This is a field declared by this model
  // (note `int`, not `let`); it is read below as Znver3Model.VecLoadLatency.
  // AMD SOG 19h, 2.12 L1 Data Cache:
  //   <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  int VecLoadLatency = 7;

  // Latency of a simple store operation. Also declared here; read below as
  // Znver3Model.StoreLatency.
  int StoreLatency = 1;

  // FIXME: any better choice?
  let HighLatency = 25;

  // AMD SOG 19h, 2.8 Optimizing Branching:
  //   the branch misprediction penalty is in the range from 11 to 18 cycles,
  //   <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  // Enable Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  let CompleteModel = 1;
}
66
67let SchedModel = Znver3Model in {
68
69
70//===----------------------------------------------------------------------===//
71// RCU
72//===----------------------------------------------------------------------===//
73
// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to eight macro ops per cycle.
def Zn3RCU : RetireControlUnit<
    /*in-flight macro ops=*/Znver3Model.MicroOpBufferSize,
    /*macro ops retired per cycle=*/8>;
79
80//===----------------------------------------------------------------------===//
81// Units
82//===----------------------------------------------------------------------===//
83
// There are a total of three units, each one with its own schedulers.
85
86//===----------------------------------------------------------------------===//
87// Integer Execution Unit
88//
89
90// AMD SOG 19h, 2.4 Superscalar Organization
91// The processor uses four decoupled independent integer scheduler queues,
92// each one servicing one ALU pipeline and one or two other pipelines
93
94//
95// Execution pipes
96//===----------------------------------------------------------------------===//
97
// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
// Defines Zn3ALU0, Zn3ALU1, Zn3ALU2 and Zn3ALU3, one unit each.
foreach Pipe = [0, 1, 2, 3] in
def Zn3ALU#Pipe : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
def Zn3BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
// Defines Zn3AGU0, Zn3AGU1 and Zn3AGU2, one unit each.
foreach Pipe = [0, 1, 2] in
def Zn3AGU#Pipe : ProcResource<1>;
117
118//
119// Execution Units
120//===----------------------------------------------------------------------===//
121
// The names below are aliases (defvar) for the pipe that hosts the unit;
// they do not introduce additional resources.

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn3Divider    = Zn3ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn3BRU0       = Zn3ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn3Multiplier = Zn3ALU1;
132
133// Execution pipeline grouping
134//===----------------------------------------------------------------------===//
135
// General ALU operations: any of the four ALU pipes.
def Zn3ALU0123 : ProcResGroup<[
    Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3
  ]>;

// General AGU operations: any of the three AGU pipes.
def Zn3AGU012 : ProcResGroup<[
    Zn3AGU0, Zn3AGU1, Zn3AGU2
  ]>;

// Control flow: jumps, calls.
// Zn3BRU0 is an alias of Zn3ALU0, so this group is {ALU0, BRU1}.
def Zn3BRU01 : ProcResGroup<[
    Zn3BRU0, Zn3BRU1
  ]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn3ALU03 : ProcResGroup<[
    Zn3ALU0, Zn3ALU3
  ]>;

// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
// (no group is needed for a single pipe).

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn3ALU12 : ProcResGroup<[
    Zn3ALU1, Zn3ALU2
  ]>;
153
154
155//
156// Scheduling
157//===----------------------------------------------------------------------===//
158
// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 192 registers.
def Zn3IntegerPRF : RegisterFile<
    /*physical registers=*/192,
    /*register classes=*/[GR64, CCR],
    /*per-class cost=*/[1, 1],
    /*per-class move elimination=*/[1, 0],
    6,  // Max moves that can be eliminated per cycle.
    0>; // Restrict move elimination to zero regs.
164
// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
// FIXME: these are 4 separate schedulers, not a single big one.
def Zn3Int : ProcResGroup<[
    Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
    Zn3ALU1, Zn3AGU1,          // scheduler 1
    Zn3ALU2, Zn3AGU2,          // scheduler 2
    Zn3ALU3,          Zn3BRU1  // scheduler 3
  ]> {
  // Four schedulers of 24 entries each, modelled as one shared buffer.
  let BufferSize = !mul(4, 24);
}
178
179
180//===----------------------------------------------------------------------===//
181// Floating-Point Unit
182//
183
184// AMD SOG 19h, 2.4 Superscalar Organization
185// The processor uses <...> two decoupled independent floating point schedulers
186// each servicing two FP pipelines and one store or FP-to-integer pipeline.
187
188//
189// Execution pipes
190//===----------------------------------------------------------------------===//
191
// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,

// Pipes 0-3, one unit each: defines Zn3FPP0, Zn3FPP1, Zn3FPP2 and Zn3FPP3.
foreach Pipe = [0, 1, 2, 3] in
def Zn3FPP#Pipe : ProcResource<1>;

// The remaining two pipes are modelled together as one resource with
// two units.
def Zn3FPP45 : ProcResource<2>;
201
202//
203// Execution Units
204//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// Each name below is an alias (defvar) for the FP pipe hosting that unit.

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn3FPFMul0    = Zn3FPP0;
defvar Zn3FPFMul1    = Zn3FPP1;

// (v)FADD*
defvar Zn3FPFAdd0    = Zn3FPP2;
defvar Zn3FPFAdd1    = Zn3FPP3;

// All convert operations except pack/unpack
defvar Zn3FPFCvt0    = Zn3FPP2;
defvar Zn3FPFCvt1    = Zn3FPP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn3FPFDiv     = Zn3FPP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn3FPFMisc0   = Zn3FPP0;
defvar Zn3FPFMisc1   = Zn3FPP1;
defvar Zn3FPFMisc2   = Zn3FPP2;
defvar Zn3FPFMisc3   = Zn3FPP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn3FPVAdd0    = Zn3FPP0;
defvar Zn3FPVAdd1    = Zn3FPP1;
defvar Zn3FPVAdd2    = Zn3FPP2;
defvar Zn3FPVAdd3    = Zn3FPP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn3FPVMul0    = Zn3FPP0;
defvar Zn3FPVMul1    = Zn3FPP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn3FPVShuf    = Zn3FPP1;
defvar Zn3FPVShufAux = Zn3FPP2;

// Bit Shift Left/Right operations
defvar Zn3FPVShift0  = Zn3FPP1;
defvar Zn3FPVShift1  = Zn3FPP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn3FPVMisc0   = Zn3FPP0;
defvar Zn3FPVMisc1   = Zn3FPP1;
defvar Zn3FPVMisc2   = Zn3FPP2;
defvar Zn3FPVMisc3   = Zn3FPP3;

// *AES*
defvar Zn3FPAES0     = Zn3FPP0;
defvar Zn3FPAES1     = Zn3FPP1;

// *CLM*
defvar Zn3FPCLM0     = Zn3FPP0;
defvar Zn3FPCLM1     = Zn3FPP1;
265
266// Execution pipeline grouping
267//===----------------------------------------------------------------------===//
268
// Any of the four FP pipes 0-3. Stores and FP to general purpose register
// transfers do not use these; see Zn3FPLd01 / Zn3FPSt below.
def Zn3FPU0123 : ProcResGroup<[
    Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3
  ]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[
    Zn3FPFMul0, Zn3FPFMul1
  ]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn3FPFAdd01 : ProcResGroup<[
    Zn3FPFAdd0, Zn3FPFAdd1
  ]>;

// All convert operations except pack/unpack
def Zn3FPFCvt01 : ProcResGroup<[
    Zn3FPFCvt0, Zn3FPFCvt1
  ]>;

// All Divide and Square Root except Reciprocal Approximation
// (single pipe, so no group is defined; Zn3FPFDiv is used directly):
// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn3FPFMisc0123 : ProcResGroup<[
    Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3
  ]>;

def Zn3FPFMisc12 : ProcResGroup<[
    Zn3FPFMisc1, Zn3FPFMisc2
  ]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn3FPLd01 = Zn3FPP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
// Hence a single-unit resource layered on top of the two-unit Zn3FPP45.
def Zn3FPSt : ProcResource<1> {
  let Super = Zn3FPP45;
}

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn3FPVAdd0123 : ProcResGroup<[
    Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3
  ]>;

def Zn3FPVAdd01 : ProcResGroup<[
    Zn3FPVAdd0, Zn3FPVAdd1
  ]>;
def Zn3FPVAdd12 : ProcResGroup<[
    Zn3FPVAdd1, Zn3FPVAdd2
  ]>;

// Integer Multiplies, SAD, Blendvb
def Zn3FPVMul01 : ProcResGroup<[
    Zn3FPVMul0, Zn3FPVMul1
  ]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn3FPVShuf01 : ProcResGroup<[
    Zn3FPVShuf, Zn3FPVShufAux
  ]>;

// Bit Shift Left/Right operations
def Zn3FPVShift01 : ProcResGroup<[
    Zn3FPVShift0, Zn3FPVShift1
  ]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn3FPVMisc0123 : ProcResGroup<[
    Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3
  ]>;

// *AES*
def Zn3FPAES01 : ProcResGroup<[
    Zn3FPAES0, Zn3FPAES1
  ]>;

// *CLM*
def Zn3FPCLM01 : ProcResGroup<[
    Zn3FPCLM0, Zn3FPCLM1
  ]>;
329
330
331//
332// Scheduling
333//===----------------------------------------------------------------------===//
334
// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 160 vector registers
// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
// anandtech also confirms this.
def Zn3FpPRF : RegisterFile<
    /*physical registers=*/160,
    /*register classes=*/[VR64, VR128, VR256],
    /*per-class cost=*/[1, 1, 1],
    /*per-class move elimination=*/[0, 1, 1],
    6,  // Max moves that can be eliminated per cycle.
    0>; // Restrict move elimination to zero regs.
342
// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn3FP : ProcResGroup<[
    Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
    Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
  ]> {
  // Two schedulers of 32 entries each, modelled as one shared buffer.
  let BufferSize = !mul(2, 32);
}
353
354// AMD SOG 19h, 2.11 Floating-Point Unit
355// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
356// even if floating-point scheduler is full.
357// FIXME: how to model this properly?
358
359
360//===----------------------------------------------------------------------===//
361// Load-Store Unit
362//
363
// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn3LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
def Zn3Load : ProcResource<3> {
  // Loads are a sub-resource of the LS unit.
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn3LoadQueue : LoadQueue<Zn3Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
def Zn3Store : ProcResource<2> {
  // Stores are a sub-resource of the LS unit.
  let Super = Zn3LSU;
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn3StoreQueue : StoreQueue<Zn3Store>;
390
391//===----------------------------------------------------------------------===//
392// Basic helper classes.
393//===----------------------------------------------------------------------===//
394
395// Many SchedWrites are defined in pairs with and without a folded load.
396// Instructions with folded loads are usually micro-fused, so they only appear
397// as two micro-ops when dispatched by the schedulers.
398// This multiclass defines the resource usage for variants with and without
399// folded loads.
400
// Emits one anonymous WriteRes binding SchedRW to ExePorts.
//   Lat  - value for Latency.
//   Res  - value for ResourceCycles (one entry per element of ExePorts, or
//          empty to leave the list empty).
//   UOps - value for NumMicroOps.
multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps    = UOps;
    let Latency        = Lat;
    let ResourceCycles = Res;
  }
}
409
// Emits the resource usage for SchedRW and for its folded-load variant
// SchedRW.Folded.
//   ExePorts/Lat/Res/UOps - as for __zn3WriteRes, describing the
//                           register form.
//   LoadLat  - cycles added to Lat for the folded form.
//   LoadUOps - micro-ops added to UOps for the folded form.
//   AGU      - address-generation resource prepended (with Zn3Load) to
//              ExePorts for the folded form.
//   LoadRes  - cycles charged to Zn3Load in the folded form.
multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  // Register form: forwarded unchanged.
  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // Cycles charged to ExePorts in the folded form: the caller's Res if it
  // gave one, otherwise one cycle per execution port.
  defvar ExeRes = !if(!empty(Res),
                      !listsplat(1, !size(ExePorts)),
                      Res);

  // Folded form: [AGU, Zn3Load] are prepended to the ports, and [1, LoadRes]
  // to the cycles. When the caller gave no Res and LoadRes is 1, the cycle
  // list is left empty instead of being spelled out.
  defm : __zn3WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn3Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                         [],
                         !listconcat([1, LoadRes], ExeRes)),
                       !add(UOps, LoadUOps)>;
}
427
// For classes without folded loads.
// The Int/XMM/YMM flavours are identical thin wrappers over __zn3WriteRes;
// they differ only in name.
multiclass Zn3WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn3WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}

multiclass Zn3WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __zn3WriteRes<SchedRW, ExePorts,
                       Lat, Res, UOps>;
}
446
// For classes with folded loads.
// Each wrapper forwards to __zn3WriteResPair, fixing the load latency and
// the address/load resource used by the folded variant.

// Integer: Znver3Model.LoadLatency, address generated on Zn3AGU012.
multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.LoadLatency,
                           LoadUOps,
                           /*AGU=*/Zn3AGU012,
                           LoadRes>;
}

// XMM: Znver3Model.VecLoadLatency, using Zn3FPLd01.
multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.VecLoadLatency,
                           LoadUOps,
                           /*AGU=*/Zn3FPLd01,
                           LoadRes>;
}

// YMM: same parameters as the XMM flavour.
multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           /*LoadLat=*/Znver3Model.VecLoadLatency,
                           LoadUOps,
                           /*AGU=*/Zn3FPLd01,
                           LoadRes>;
}
474
475
476//===----------------------------------------------------------------------===//
477// Here be dragons.
478//===----------------------------------------------------------------------===//
479
// Reads that follow a folded integer load advance by the integer
// load latency.
def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;

// All vector read-after-load flavours advance by the vector load latency.
foreach VecRead = [ReadAfterVecLd, ReadAfterVecXLd, ReadAfterVecYLd] in
def : ReadAdvance<VecRead, Znver3Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
def : ReadAdvance<ReadInt2Fpu, -1>;
490
// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store],
                      /*Lat=*/Znver3Model.StoreLatency,
                      /*Res=*/[1, 1], /*UOps=*/0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load],
                      /*Lat=*/!add(Znver3Model.LoadLatency, 1),
                      /*Res=*/[1, 1], /*UOps=*/1>;

// Model the effect of clobbering the read-write mask operand of the GATHER
// operation. Does not cost anything by itself, only has latency, matching
// that of the WriteLoad.
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [],
                      /*Lat=*/!add(Znver3Model.LoadLatency, 1),
                      /*Res=*/[], /*UOps=*/0>;

// Loads listed below: same latency as WriteLoad, but 3 cycles on the AGU.
def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
  let NumMicroOps = 1;
  let Latency = !add(Znver3Model.LoadLatency, 1);
  let ResourceCycles = [3, 1];
}
def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm,
                                        MOVSX16rm16, MOVSX16rm32,
                                        MOVZX16rm16,
                                        MOVSX16rm8, MOVZX16rm8)>;

// Regular and non-temporal stores share the same numbers.
defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store],
                      /*Lat=*/Znver3Model.StoreLatency,
                      /*Res=*/[1, 2], /*UOps=*/1>;
defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store],
                      /*Lat=*/Znver3Model.StoreLatency,
                      /*Res=*/[1, 2], /*UOps=*/1>;

defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123],
                      /*Lat=*/1, /*Res=*/[4], /*UOps=*/1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

// 16-bit MOVBE load.
def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
  let NumMicroOps = 1;
  let Latency = Znver3Model.LoadLatency;
  let ResourceCycles = [1, 1, 4];
}
def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;

// MOVBE stores of every width.
def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
  let NumMicroOps = 2;
  let Latency = Znver3Model.StoreLatency;
  let ResourceCycles = [4, 1, 1];
}
def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
529
// Arithmetic.

// Simple integer ALU op.
defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123],
                          /*Lat=*/1, /*Res=*/[1], /*UOps=*/1>;

// Accumulator-immediate forms of ADD/AND/OR/SUB/XOR: same latency, but
// 4 cycles on the ALU group.
def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [4];
}
def : InstRW<[Zn3WriteALUSlow], (instrs
    ADD8i8, ADD16i16, ADD32i32, ADD64i32,
    AND8i8, AND16i16, AND32i32, AND64i32,
     OR8i8,  OR16i16,  OR32i32,  OR64i32,
    SUB8i8, SUB16i16, SUB32i32, SUB64i32,
    XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

// Register-to-register sign/zero extending moves with a 16-bit destination.
def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [4];
}
def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32,
                                           MOVZX16rr16,
                                           MOVSX16rr8, MOVZX16rr8)>;

// Materializing a 32-bit immediate.
def Zn3WriteMaterialize32bitImm : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt,
                                                    MOV64ri32)>;

// PDEP/PEXT, register forms: issued on ALU1 only.
def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;

// Integer ALU + flags op.
defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123],
                          /*Lat=*/1, /*Res=*/[4], /*UOps=*/1>;

// 8-bit ADC/SBB with a memory destination.
def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                           Zn3ALU0123, Zn3Store]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [1, 1, 7, 1];
}
def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
574
// This is for simple LEAs with one or two input operands.
// LEA instructions can't fold loads, hence the non-pair helper.
defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012],
                      /*Lat=*/1, /*Res=*/[1], /*UOps=*/1>;

// This write is used for slow LEA instructions.
def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ResourceCycles = [1];
}

// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn3SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // Case 1: a 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // Case 2: operand 2 (the "Scale") is an immediate other than 1.
    CheckAll<[CheckIsImmOperand<2>,
              CheckNot<CheckImmOperand<2, 1>>]>
  ]>
>;

// Picks the slow write when the predicate matches, plain WriteLEA otherwise.
def Zn3WriteLEA : SchedWriteVariant<[
  SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
  SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// 16-bit LEA gets its own, unconditional, slow write.
def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [4];
}

def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
613
// Integer multiplication. Everything here issues on Zn3Multiplier.

// Integer 8-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[3], /*UOps=*/1>;
// Integer 16-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[3], /*UOps=*/3,
                          /*LoadUOps=*/1>;
// Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier],
                          /*Lat=*/4, /*Res=*/[4], /*UOps=*/2>;
// Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[1], /*UOps=*/1>;
// Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[3], /*UOps=*/2>;

// MULX, 32-bit, register form.
def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
  let NumMicroOps = 2;
  let Latency = 4;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;

// MULX, 32-bit, memory form: latency and micro-op count are derived from
// the register form.
def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
  let NumMicroOps = Zn3MULX32rr.NumMicroOps;
  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>;

// Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[1], /*UOps=*/1>;
// Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[1], /*UOps=*/1>;
// Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[3], /*UOps=*/2>;

// MULX, 64-bit, register form.
def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
  let NumMicroOps = 2;
  let Latency = 4;
  let ResourceCycles = [1];
}
def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;

// MULX, 64-bit, memory form: latency and micro-op count are derived from
// the register form.
def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
  let NumMicroOps = Zn3MULX64rr.NumMicroOps;
  let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>;

// Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[1], /*UOps=*/1>;
// Integer 64-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier],
                          /*Lat=*/3, /*Res=*/[1], /*UOps=*/1>;
// Integer multiplication, high part: latency only, no resources or uops.
defm : Zn3WriteResInt<WriteIMulH, [],
                      /*Lat=*/4, /*Res=*/[], /*UOps=*/0>;

// Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123],
                      /*Lat=*/1, /*Res=*/[1], /*UOps=*/1>;
// Byte Order (Endianness) 64-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123],
                      /*Lat=*/1, /*Res=*/[1], /*UOps=*/1>;
659
// Compare and set, compare and swap.
defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123],
                          /*Lat=*/3, /*Res=*/[12], /*UOps=*/5>;

// 8-bit CMPXCHG, register form.
def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 3;
  let Latency = 3;
  let ResourceCycles = [12];
}
def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

// Compare and set, compare and swap (read-modify-write part).
defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123],
                      /*Lat=*/3, /*Res=*/[12], /*UOps=*/6>;

// 8-bit CMPXCHG, memory forms: built on top of the register form's latency
// and micro-op count.
def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                                  Zn3ALU0123]> {
  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
  let ResourceCycles = [1, 1, 12];
}
def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 19;
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
}
def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 28;
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ResourceCycles = [59];
}
def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B,
                                                       LCMPXCHG16B)>;

// 8-bit and 16-bit register exchanges.
def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ResourceCycles = [2];
}
def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr,
                                                      XCHG16ar)>;

// 8-bit and 16-bit exchanges with memory.
def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                              Zn3ALU0123]> {
  let NumMicroOps = 5;
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 3);
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

// 32-bit and 64-bit exchanges with memory.
def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load,
                                               Zn3ALU0123]> {
  let NumMicroOps = 2;
  // FIXME: not from llvm-exegesis
  let Latency = !add(Znver3Model.LoadLatency, 2);
  let ResourceCycles = [1, 1, 2];
}
def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
712
// Integer division. Everything here issues on Zn3Divider, and the divider
// is held for as many cycles as the latency.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.

// Unsigned.
defm : Zn3WriteResIntPair<WriteDiv8,  [Zn3Divider],
                          /*Lat=*/10, /*Res=*/[10], /*UOps=*/2>;
defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider],
                          /*Lat=*/11, /*Res=*/[11], /*UOps=*/2>;
defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider],
                          /*Lat=*/13, /*Res=*/[13], /*UOps=*/2>;
defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider],
                          /*Lat=*/17, /*Res=*/[17], /*UOps=*/2>;
// Signed: same numbers as the unsigned forms.
defm : Zn3WriteResIntPair<WriteIDiv8,  [Zn3Divider],
                          /*Lat=*/10, /*Res=*/[10], /*UOps=*/2>;
defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider],
                          /*Lat=*/11, /*Res=*/[11], /*UOps=*/2>;
defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider],
                          /*Lat=*/13, /*Res=*/[13], /*UOps=*/2>;
defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider],
                          /*Lat=*/17, /*Res=*/[17], /*UOps=*/2>;

// Bit scan forward.
defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1],
                          /*Lat=*/3, /*Res=*/[3], /*UOps=*/6,
                          /*LoadUOps=*/2>;
// Bit scan reverse.
defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1],
                          /*Lat=*/4, /*Res=*/[4], /*UOps=*/6,
                          /*LoadUOps=*/2>;

// Bit population count.
defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123],
                          /*Lat=*/1, /*Res=*/[1], /*UOps=*/1>;

// 16-bit register POPCNT: 4 cycles on the ALU group.
def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [4];
}
def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;

// Leading zero count.
defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123],
                          /*Lat=*/1, /*Res=*/[1], /*UOps=*/1>;

// 16-bit register LZCNT: 4 cycles on the ALU group.
def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ResourceCycles = [4];
}
def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;

// Trailing zero count.
defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12],
                          /*Lat=*/2, /*Res=*/[1], /*UOps=*/2>;

// 16-bit register TZCNT: 4 cycles on the full ALU group.
def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
  let NumMicroOps = 2;
  let Latency = 2;
  let ResourceCycles = [4];
}
def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
754
// Condition-code consumers/producers. Several entries carry FIXMEs because
// their numbers were not obtained from llvm-exegesis measurements.
// WriteSETCCStore additionally occupies Zn3AGU012 and Zn3Store.
755defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
756defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
757defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
758defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
759defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.
760
// Bit-test families. The *ImmLd / *RegLd classes express their latency as
// Znver3Model.LoadLatency plus a constant and occupy Zn3AGU012 and Zn3Load in
// addition to Zn3ALU12. The *RegLd classes are given a noticeably larger last
// argument (7 and 9) than the *ImmLd classes (2 and 4).
761defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
762defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
763defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;
764
765defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
766defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
767defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
768
769// Integer shifts and rotates.
// The generic shift/rotate classes share one set of arguments: Zn3ALU12 and
// /*LoadUOps=*/1. The RCL/RCR instructions get dedicated writes further below.
770defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
771defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
772defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
773
// RCL/RCR by one, register forms (all four operand widths): latency 1,
// 1 micro-op, Zn3ALU12 held for 2 cycles.
774def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
775  let Latency = 1;
776  let ResourceCycles = [2];
777  let NumMicroOps = 1;
778}
779def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
780                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
781
// RCL/RCR by one, memory forms. Latency and micro-op count are derived from
// Zn3WriteRotateR1 (plus Znver3Model.LoadLatency and plus one micro-op);
// Zn3AGU012 and Zn3Load are each held for one cycle in addition to the ALU.
782def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
783  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
784  let ResourceCycles = [1, 1, 2];
785  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
786}
787def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
788                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
789
// RCR by immediate, register forms: latency 3, 7 micro-ops, Zn3ALU12 held for
// 6 cycles.
790def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
791  let Latency = 3;
792  let ResourceCycles = [6];
793  let NumMicroOps = 7;
794}
795def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
796
// RCR by immediate, memory forms. Latency is the register-form latency plus
// Znver3Model.LoadLatency; the ALU is held for 8 cycles here versus 6 above.
// NOTE(review): this adds 3 micro-ops to the register form, whereas
// Zn3WriteRotateLeftMI, Zn3WriteRotateRightMCL and Zn3WriteRotateLeftMCL add
// 2 -- confirm the difference is intentional.
797def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
798  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
799  let ResourceCycles = [1, 1, 8];
800  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
801}
802def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
803
// RCL by immediate, register forms: latency 4, 9 micro-ops, Zn3ALU12 held for
// 8 cycles.
804def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
805  let Latency = 4;
806  let ResourceCycles = [8];
807  let NumMicroOps = 9;
808}
809def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
810
// RCL by immediate, memory forms. Latency is the register-form latency plus
// Znver3Model.LoadLatency, with two additional micro-ops; Zn3AGU012 and
// Zn3Load are each held for one cycle in addition to the ALU.
811def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
812  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
813  let ResourceCycles = [1, 1, 8];
814  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
815}
816def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
817
// Generic rotate-by-CL class; RCR by CL gets dedicated writes below.
818defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
819
// RCR by CL, register forms: latency 3, 7 micro-ops, Zn3ALU12 held for
// 6 cycles (the same numbers as Zn3WriteRotateRightRI).
820def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
821  let Latency = 3;
822  let ResourceCycles = [6];
823  let NumMicroOps = 7;
824}
825def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
826
// RCR by CL, memory forms. Latency is the register-form latency plus
// Znver3Model.LoadLatency, with two additional micro-ops.
827def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
828  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
829  let ResourceCycles = [1, 1, 8];
830  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
831}
832def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
833
// RCL by CL, register forms: latency 4, 9 micro-ops, Zn3ALU12 held for
// 8 cycles (the same numbers as Zn3WriteRotateLeftRI).
834def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
835  let Latency = 4;
836  let ResourceCycles = [8];
837  let NumMicroOps = 9;
838}
839def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
840
// RCL by CL, memory forms. Latency is the register-form latency plus
// Znver3Model.LoadLatency, with two additional micro-ops.
841def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
842  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
843  let ResourceCycles = [1, 1, 8];
844  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
845}
846def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
847
848// Double shift instructions.
// The two memory classes (WriteSHDmri / WriteSHDmrcl) are given identical
// arguments: Znver3Model.LoadLatency + 2, with Zn3AGU012 and Zn3Load occupied
// in addition to Zn3ALU12.
849defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
850defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
851defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
852defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
853
854// BMI1 BEXTR/BLS, BMI2 BZHI
// WriteBLS is bound to the wider Zn3ALU0123 group; BEXTR and BZHI use Zn3ALU12.
855defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
856defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
857defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
858
859// Idioms that clear a register, like xorps %xmm0, %xmm0.
860// These can often bypass execution ports completely.
// Hence the 0 and [0] arguments below; the final argument is still 1.
861defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
862
863// Branches don't produce values, so they have no latency, but they still
864// consume resources. Indirect branches can fold loads.
865defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
866
867// Floating point. This covers both scalar and vector operations.
// WriteFLD0/FLD1/FLDC express latency relative to Znver3Model.LoadLatency and
// also occupy Zn3FPP1. The WriteFLoad* / WriteFMaskedLoad* classes all use
// Znver3Model.VecLoadLatency + 1; WriteFStore uses Znver3Model.StoreLatency.
868defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
869defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
870defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
871defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
872defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
873defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
874defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
875defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
876defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
877
// Dedicated write for the MOVHPD/MOVHPS store forms (SSE and VEX encodings):
// latency 2, 2 micro-ops, Zn3FPSt and Zn3Store each held for one cycle.
// NOTE(review): the record name says "MMX" (and repeats "Write"), but the
// instructions bound below are MOVHPDmr/MOVHPSmr and their VEX variants.
878def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
879  let Latency = 2; // FIXME: not from llvm-exegesis
880  let ResourceCycles = [1, 1];
881  let NumMicroOps = 2;
882}
883def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr,  MOVHPSmr,
884                                               VMOVHPDmr, VMOVHPSmr)>;
885
// Remaining floating-point store classes (regular and non-temporal); all use
// Znver3Model.StoreLatency with Zn3FPSt and Zn3Store.
886defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
887defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
888defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
889defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
890defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
891
// Floating-point masked stores: same resources and latency expression as the
// plain stores, but with much larger Zn3FPSt cycle counts and last arguments.
// The same four argument sets are used for the WriteVecMaskedStore* classes.
892defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
893defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
894defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
895defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
896
897defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point add/sub.
898
// Dedicated write for the x87 ADD/SUB/SUBR/MUL *_FI16m / *_FI32m memory
// forms: Zn3FPU0123 is held for 24 cycles; latency is
// Znver3Model.LoadLatency + 1 (not measured, see FIXME) and 2 micro-ops.
899def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
900  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
901  let ResourceCycles = [1, 1, 24];
902  let NumMicroOps = 2;
903}
904def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
905                                         SUB_FI16m, SUB_FI32m,
906                                         SUBR_FI16m, SUBR_FI32m,
907                                         MUL_FI16m, MUL_FI32m)>;
908
// Dedicated write for the x87 DIV/DIVR *_FI16m / *_FI32m memory forms. Same
// shape as Zn3WriteX87Arith, but Zn3FPU0123 is held for 62 cycles.
909def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
910  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
911  let ResourceCycles = [1, 1, 62];
912  let NumMicroOps = 2;
913}
914def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
915                                       DIVR_FI16m, DIVR_FI32m)>;
916
// Floating-point add/sub and compare classes. Every *Z (ZMM) class in this
// run is declared through X86WriteResPairUnsupported.
917defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
918defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
919defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
920defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
921defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
922defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
923defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
924defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>;  // Floating point compare.
925defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
926defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
927defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
928defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>;  // Floating point double compare.
929defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
930defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
931defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
932defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
933defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
// Floating-point multiply, divide and square-root classes. Multiplies are
// bound to Zn3FPFMul01; divides and square roots are bound to Zn3FPFDiv.
// Every *Z (ZMM) class in this run is declared through
// X86WriteResPairUnsupported.
934defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>;  // Floating point multiplication.
935defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
936defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
937defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
938defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
939defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
940defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
941defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
942defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>;  // Floating point division.
943defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
944defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
945defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
946defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>;  // Floating point double division.
947defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
948defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
949defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
950defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>;   // Floating point square root.
951defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
952defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
953defm : X86WriteResPairUnsupported<WriteFSqrtZ>;  // Floating point square root (ZMM).
954defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>;  // Floating point double square root.
955defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
956defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
957defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
958defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
// Reciprocal / reciprocal-square-root estimates, FMA and dot products.
// WriteFRcp* is bound to Zn3FPFMul01 while WriteFRsqrt* is bound to Zn3FPFDiv.
// The dot-product classes pass explicit /*LoadUOps=*/ values.
959defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>;  // Floating point reciprocal estimate.
960defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
961defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
962defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
963defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>;  // Floating point reciprocal square root estimate.
964defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
965defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
966defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
967defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>;  // Fused Multiply Add.
968defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
969defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
970defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
971defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
972defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
973defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
974defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
// Floating-point sign, rounding, logic, test, shuffle and blend classes.
// Every *Z (ZMM) class in this run is declared through
// X86WriteResPairUnsupported.
975defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
976defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
977defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
978defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
979defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
980defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
981defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
982defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
983defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
984defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
985defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
986defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
987defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
988defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
989defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
990defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
991defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
992defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
993defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
994defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
995defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
996defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).
997
998// Horizontal Add/Sub (float and integer)
// The float classes are bound to Zn3FPFAdd0 and the integer classes to
// Zn3FPVAdd0. Some entries pass an explicit /*LoadUOps=*/1 and others rely on
// the multiclass default.
999defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
1000defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
1001defm : X86WriteResPairUnsupported<WriteFHAddZ>;
1002defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
1003defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
1004defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
1005defm : X86WriteResPairUnsupported<WritePHAddZ>;
1006
1007// Vector integer operations.
// Vector loads (regular, non-temporal, masked) all use
// Znver3Model.VecLoadLatency + 1 with Zn3FPLd01 and Zn3Load; the two store
// classes use Znver3Model.StoreLatency with Zn3FPSt and Zn3Store.
1008defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1009defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1010defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1011defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1012defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1013defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1014defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
1015defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1016defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1017
// Register-register VEXTRACTF128 / VEXTRACTI128: latency 4, 1 micro-op,
// Zn3FPFMisc0 held for one cycle. The memory-form writes that follow derive
// their latency and micro-op count from this record.
1018def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
1019  let Latency = 4;
1020  let ResourceCycles = [1];
1021  let NumMicroOps = 1;
1022}
1023def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
1024
// Store forms of VEXTRACTF128 / VEXTRACTI128: the register-form operation
// followed by a store (Zn3FPSt + Zn3Store), with one extra micro-op.
// The latency is the register-form latency plus Znver3Model.StoreLatency.
// This write occupies store resources and no load resources, so it uses the
// store latency, matching the other "+ store" writes in this model
// (WriteVecExtractSt, WriteCvtPS2PHSt). It previously added LoadLatency.
1025def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
1026  let Latency = !add(Znver3Model.StoreLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1027  let ResourceCycles = [1, 1, 1];
1028  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1029}
1030def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1031
// VINSERTF128 with a memory source. Latency is Znver3Model.LoadLatency plus
// the VEXTRACT register-form latency; the micro-op count equals the VEXTRACT
// register-form count (the !add adds 0). Zn3AGU012, Zn3Load and Zn3FPFMisc0
// are each held for one cycle.
// NOTE(review): only VINSERTF128rm is bound here; no VINSERTI128rm entry is
// visible in this chunk -- confirm whether that is intentional.
1032def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
1033  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1034  let ResourceCycles = [1, 1, 1];
1035  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1036}
1037def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
1038
// Remaining vector store classes. The masked-store entries reuse the same
// four argument sets as the WriteFMaskedStore* classes.
1039defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1040defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1041defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1042defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1043defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
1044defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
1045defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1046
// Moves between vector registers and general-purpose registers; both
// directions are given identical arguments on Zn3FPLd01.
1047defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
1048defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
1049
// Dedicated write for MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr: latency 1,
// 2 micro-ops, Zn3FPFMisc0123 held for 2 cycles.
1050def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1051  let Latency = 1;
1052  let ResourceCycles = [1, 2];
1053  let NumMicroOps = 2;
1054}
1055def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1056
// Dedicated write for MMX_MOVD64rr / MMX_MOVD64to64rr: identical to
// Zn3WriteMOVMMX except that Zn3FPFMisc0123 is held for 4 cycles instead of 2.
1057def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1058  let Latency = 1;
1059  let ResourceCycles = [1, 4];
1060  let NumMicroOps = 2;
1061}
1062def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1063
1064defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.
1065
// Dedicated write for EXTRQ / INSERTQ: latency 3, 1 micro-op, Zn3FPVShuf01
// and Zn3FPLd01 each held for one cycle.
1066def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1067  let Latency = 3;
1068  let ResourceCycles = [1, 1];
1069  let NumMicroOps = 1;
1070}
1071def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1072
// Dedicated write for EXTRQI / INSERTQI: same as Zn3WriteEXTRQ_INSERTQ except
// for 2 micro-ops instead of 1.
1073def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1074  let Latency = 3;
1075  let ResourceCycles = [1, 1];
1076  let NumMicroOps = 2;
1077}
1078def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1079
1080defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1081
// Dedicated write for a subset of XMM register-register vector ALU
// instructions (abs, saturating add/sub, average, sign, VPCMPEQQ). Latency,
// cycles and micro-op count are all 1; the difference from the generic class
// above is the narrower resource group Zn3FPVAdd01 instead of Zn3FPVAdd0123.
1082def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1083  let Latency = 1;
1084  let ResourceCycles = [1];
1085  let NumMicroOps = 1;
1086}
1087def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1088                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1089                                            PAVGBrr, PAVGWrr,
1090                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
1091                                            VPABSBrr, VPABSDrr, VPABSWrr,
1092                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1093                                            VPAVGBrr, VPAVGWrr,
1094                                            VPCMPEQQrr,
1095                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1096                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1097
// MMX counterpart of Zn3WriteVecALUXSlow: the MMX abs, sign, saturating
// add/sub and average register forms, bound to Zn3FPVAdd01 with latency,
// cycles and micro-op count all 1.
1098def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
1099  let Latency = 1;
1100  let ResourceCycles = [1];
1101  let NumMicroOps = 1;
1102}
1103def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
1104                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
1105                                           MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr,
1106                                           MMX_PAVGBirr, MMX_PAVGWirr,
1107                                           MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>;
1108
1109defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1110
// YMM counterpart of Zn3WriteVecALUXSlow: the same instruction families in
// their Y register-register forms, bound to Zn3FPVAdd01 instead of the
// generic class's Zn3FPVAdd0123.
1111def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1112  let Latency = 1;
1113  let ResourceCycles = [1];
1114  let NumMicroOps = 1;
1115}
1116def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1117                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1118                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1119                                            VPAVGBYrr, VPAVGWYrr,
1120                                            VPCMPEQQYrr,
1121                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1122
// Vector integer logic, test, shift and multiply classes. Every *Z (ZMM)
// class in this run is declared through X86WriteResPairUnsupported.
// WriteVecTest* occupies two resources (Zn3FPVAdd12 and Zn3FPSt).
1123defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
1124defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
1125defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1126defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1127defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
1128defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1129defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
1130defm : X86WriteResPairUnsupported<WriteVecTestZ>;  // Vector integer TEST instructions (ZMM).
1131defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
1132defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
1133defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1134defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
1135defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
1136defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1137defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1138defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
1139defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
1140defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1141defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1142defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
1143defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
1144defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1145defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
// Vector integer shuffle, blend and sum-of-absolute-differences classes.
// Every *Z (ZMM) class in this run is declared through
// X86WriteResPairUnsupported. WriteBlend* is bound to Zn3FPVMisc0123 while
// WriteVarBlend* is bound to Zn3FPVMul01.
1146defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector shuffles.
1147defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1148defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1149defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
1150defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
1151defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1152defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1153defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
1154defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
1155defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1156defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
1157defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
1158defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1159defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
1160defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
1161defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1162defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1163defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
1164defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1165defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1166defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
1167defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1168
1169// Vector insert/extract operations.
// NOTE(review): WriteVecInsert passes a negative /*LoadUOps=*/-1; presumably
// this adjusts the micro-op count of the load-folded form downward -- confirm
// against the Zn3WriteResXMMPair multiclass, which is outside this chunk.
1170defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1171defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1172defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1173
1174// MOVMSK operations.
// All four classes share the same arguments and are bound to Zn3FPVMisc2.
1175defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1176defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1177defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
1178defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1179
1180// Conversion between integer and float.
// All conversion classes in this group are bound to Zn3FPFCvt01.
1181defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>;  // Double -> Integer.
1182defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
1183defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
1184defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
1185
// Dedicated write for the MMX packed double -> integer conversions
// (CVTPD2PI / CVTTPD2PI). Both the irm and irr forms are bound to this same
// write: latency 1, 2 micro-ops, Zn3FPFCvt01 held for 2 cycles.
1186def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1187  let Latency = 1;
1188  let ResourceCycles = [2];
1189  let NumMicroOps = 2;
1190}
1191def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm, MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>;
1192
// Float -> integer conversions, bound to Zn3FPFCvt01. The XMM and YMM packed
// classes are given identical arguments.
1193defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>;  // Float -> Integer.
1194
1195defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1196defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
1197defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
1198
// Integer -> double conversions, bound to Zn3FPFCvt01. The scalar and YMM
// classes pass /*LoadUOps=*/-1.
1199defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
1200defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1201defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1202defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
1203
// Dedicated write for MMX_CVTPI2PD (irm and irr forms): latency 2,
// 2 micro-ops, Zn3FPFCvt01 held for 6 cycles.
1204def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1205  let Latency = 2;
1206  let ResourceCycles = [6];
1207  let NumMicroOps = 2;
1208}
1209def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>;
1210
// Integer -> float conversions, bound to Zn3FPFCvt01. The scalar class passes
// /*LoadUOps=*/-1.
1211defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
1212defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1213defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1214defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
1215
// Dedicated write for MMX_CVTPI2PSirr: latency 3, 2 micro-ops, Zn3FPFCvt01
// held for one cycle.
// NOTE(review): only the irr form is bound here, whereas Zn3WriteCvtI2PDMMX
// binds both irm and irr -- confirm the memory form is meant to keep its
// default scheduling class.
1216def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1217  let Latency = 3;
1218  let ResourceCycles = [1];
1219  let NumMicroOps = 2;
1220}
1221def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>;
1222
// Float -> double widening conversions on Zn3FPFCvt01. The scalar and XMM
// writes share the same numbers; the YMM write differs and passes
// LoadUOps=-1. The ZMM write is unsupported.
1223defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion.
1224defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1225defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1226defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).
1227
// Double -> float narrowing conversions on Zn3FPFCvt01. The YMM write uses
// the same numbers as the YMM Double -> Integer write (6, [2], 2).
1228defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion.
1229defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1230defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1231defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).
1232
// Half -> float conversions use the *Pair multiclasses (register + folded
// load); the ZMM write is unsupported.
1233defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1234defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1235defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
1236
// Float -> half conversions use the non-Pair multiclasses; the store forms
// have their own writes (the *St entries) instead of a folded-load variant.
1237defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1238defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1239defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
// Store forms: latency is the register-form latency (3 / 6) plus the model's
// StoreLatency, and Zn3FPSt + Zn3Store are added to the resource list.
1240defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1241defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1242defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
1243
1244// CRC32 instruction.
// Modelled on a single integer resource (Zn3ALU1) rather than an ALU group.
1245defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;
1246
// SHA1MSG1, register form: 2 cycles latency, 2 cycles of Zn3FPU0123,
// 2 micro-ops.
1247def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1248  let Latency = 2;
1249  let ResourceCycles = [2];
1250  let NumMicroOps = 2;
1251}
1252def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1253
// SHA1MSG1, memory form: derived from the register form. Latency is the
// model's LoadLatency plus the rr latency; one cycle each of Zn3AGU012 and
// Zn3Load is added in front of the FPU cycles; the micro-op count is the rr
// count plus 0 (no extra micro-op for the load).
1254def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1255  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
1256  let ResourceCycles = [1, 1, 2];
1257  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
1258}
1259def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
1260
// Shared write for the register forms of SHA1MSG2 and SHA1NEXTE: 1 cycle
// latency, 2 cycles of Zn3FPU0123, 1 micro-op.
1261def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
1262  let Latency = 1;
1263  let ResourceCycles = [2];
1264  let NumMicroOps = 1;
1265}
1266def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1267
// Shared write for the memory forms: LoadLatency plus the rr latency, one
// cycle each of Zn3AGU012 and Zn3Load added, micro-op count equal to the rr
// count (plus 0).
1268def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1269  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1270  let ResourceCycles = [1, 1, 2];
1271  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1272}
1273def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1274
// SHA256MSG1, register form: 2 cycles latency, 3 cycles of Zn3FPU0123,
// 2 micro-ops.
1275def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1276  let Latency = 2;
1277  let ResourceCycles = [3];
1278  let NumMicroOps = 2;
1279}
1280def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
1281
// SHA256MSG1, memory form: LoadLatency plus the rr latency, one cycle each
// of Zn3AGU012 and Zn3Load added, micro-op count equal to the rr count
// (plus 0).
1282def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1283  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
1284  let ResourceCycles = [1, 1, 3];
1285  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
1286}
1287def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1288
// SHA256MSG2, register form: 3 cycles latency, 8 cycles of Zn3FPU0123,
// 4 micro-ops.
1289def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
1290  let Latency = 3;
1291  let ResourceCycles = [8];
1292  let NumMicroOps = 4;
1293}
1294def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
1295
// SHA256MSG2, memory form: LoadLatency plus the rr latency, one cycle each
// of Zn3AGU012 and Zn3Load added.
// NOTE(review): this is the only SHA memory form in this group that adds 1
// micro-op for the load; the others add 0. Presumably a measured value -
// confirm before "normalising" it.
1296def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1297  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
1298  let ResourceCycles = [1, 1, 8];
1299  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
1300}
1301def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1302
// SHA1RNDS4, register-immediate form: 6 cycles latency, 8 cycles of
// Zn3FPU0123, 1 micro-op. No memory-form write is defined next to it.
1303def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
1304  let Latency = 6;
1305  let ResourceCycles = [8];
1306  let NumMicroOps = 1;
1307}
1308def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1309
// SHA256RNDS2, register form: 4 cycles latency, 8 cycles of Zn3FPU0123,
// 1 micro-op. No memory-form write is defined next to it.
1310def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
1311  let Latency = 4;
1312  let ResourceCycles = [8];
1313  let NumMicroOps = 1;
1314}
1315def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1316
1317// Strings instructions.
// All four packed string-compare writes are modelled on Zn3FPVAdd0123.
// Three of them pass an explicit LoadUOps value; WritePCmpIStrI does not and
// so uses the multiclass default (defined outside this chunk).
1318// Packed Compare Implicit Length Strings, Return Mask
1319defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1320// Packed Compare Explicit Length Strings, Return Mask
1321defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1322// Packed Compare Implicit Length Strings, Return Index
1323defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
1324// Packed Compare Explicit Length Strings, Return Index
1325defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1326
1327// AES instructions.
// The three AES writes share identical numbers on Zn3FPAES01.
1328defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
1329defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
1330defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.
1331
1332// Carry-less multiplication instructions.
// Modelled on its own resource group, Zn3FPCLM01.
1333defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;
1334
1335// EMMS/FEMMS
1336defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1337
1338// Load/store MXCSR
// LDMXCSR: latency is the model's LoadLatency + 1; resources are listed as
// AGU, load, then ALU. STMXCSR: latency is 1 + the model's StoreLatency;
// resources are listed as ALU, AGU, then store. The cycle lists follow the
// same order as the resource lists, so the 6 and the 60 both apply to
// Zn3ALU0123.
1339defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1340defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1341
1342// Catch-all for expensive system instructions.
// Uses the same placeholder value (100) for all three numeric arguments.
1343defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;
1344
// VZEROUPPER: zero latency, 1 cycle of Zn3FPU0123, 1 micro-op.
1345def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
1346  let Latency = 0; // FIXME: not from llvm-exegesis
1347  let ResourceCycles = [1];
1348  let NumMicroOps = 1;
1349}
1350def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;
1351
// VZEROALL: 10 cycles latency, 24 cycles of Zn3FPU0123, 18 micro-ops.
1352def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
1353  let Latency = 10; // FIXME: not from llvm-exegesis
1354  let ResourceCycles = [24];
1355  let NumMicroOps = 18;
1356}
1357def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;
1358
1359// AVX2.
// 256-bit shuffles on Zn3FPVShuf. The two Fp writes pass an explicit LoadUOps
// value (2 and 1); the integer write uses the multiclass default.
1360defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1361defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1362defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
1363
// Shared write for the register forms of VPERM2I128 and VPERM2F128:
// 3 cycles latency, 1 cycle of Zn3FPVShuf, 1 micro-op.
1364def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
1365  let Latency = 3;
1366  let ResourceCycles = [1];
1367  let NumMicroOps = 1;
1368}
1369def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
1370
// Memory form: LoadLatency plus the rr latency, one cycle each of Zn3AGU012
// and Zn3Load added, micro-op count equal to the rr count (plus 0).
// NOTE(review): only VPERM2F128rm is bound below; VPERM2I128rm is not given
// an override in this chunk - confirm that is intentional.
1371def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1372  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
1373  let ResourceCycles = [1, 1, 1];
1374  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1375}
1376def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
1377
// VPERMPS (YMM), register form: 7 cycles latency, 1 cycle of Zn3FPVShuf,
// 2 micro-ops.
1378def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
1379  let Latency = 7;
1380  let ResourceCycles = [1];
1381  let NumMicroOps = 2;
1382}
1383def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
1384
// VPERMPS (YMM), memory form: LoadLatency plus the rr latency. Unlike the rr
// form it occupies Zn3FPVShuf for 2 cycles, and it adds 1 micro-op.
1385def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1386  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
1387  let ResourceCycles = [1, 1, 2];
1388  let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
1389}
1390def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1391
// Shared write for the register-immediate forms of VPERMPD and VPERMQ (YMM):
// 6 cycles latency, 1 cycle of Zn3FPVShuf, 2 micro-ops.
1392def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
1393  let Latency = 6;
1394  let ResourceCycles = [1];
1395  let NumMicroOps = 2;
1396}
1397def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
1398
// VPERMPD (YMM), memory-immediate form: LoadLatency plus the ri latency,
// 2 cycles of Zn3FPVShuf, and 1 extra micro-op over the ri form.
1399def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1400  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
1401  let ResourceCycles = [1, 1, 2];
1402  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
1403}
1404def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1405
// VPERMD (YMM), register form: 5 cycles latency, 1 cycle of Zn3FPVShuf,
// 2 micro-ops.
1406def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
1407  let Latency = 5;
1408  let ResourceCycles = [1];
1409  let NumMicroOps = 2;
1410}
1411def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;
1412
// Shared memory-form write for VPERMQYmi and VPERMDYrm: LoadLatency plus the
// VPERMDYrr latency, 2 cycles of Zn3FPVShuf, micro-op count equal to the
// VPERMDYrr count (plus 0).
// NOTE(review): VPERMQYmi therefore derives its latency from VPERMDYrr (5),
// not from its own register form VPERMQYri (Zn3WriteVPERMYri, 6) - confirm
// this is intended.
1413def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1414  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
1415  let ResourceCycles = [1, 1, 2];
1416  let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
1417}
1418def : InstRW<[Zn3WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
1419
// Remaining AVX2 writes: width-changing moves and variable shuffles on
// Zn3FPVShuf01, variable vector shifts on Zn3FPVShift01. The XMM and YMM
// shift writes share the same numbers; the ZMM shift write is unsupported.
1420defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1421defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
1422defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts.
1423defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1424defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Variable vector shifts (ZMM).
1425
1426// Old microcoded instructions that nobody uses.
// Uses the same placeholder value (100) for all three numeric arguments,
// matching the WriteSystem catch-all.
1427defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;
1428
1429// Fence instructions.
// Generic fence write on Zn3ALU0123; LFENCE and SFENCE are overridden below
// with dedicated writes on Zn3LSU.
1430defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;
1431
// LFENCE: 1 cycle latency, 30 cycles of Zn3LSU, 1 micro-op.
1432def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
1433  let Latency = 1;
1434  let ResourceCycles = [30];
1435  let NumMicroOps = 1;
1436}
1437def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;
1438
// SFENCE: 1 cycle latency, 1 cycle of Zn3LSU, 1 micro-op.
1439def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
1440  let Latency = 1;
1441  let ResourceCycles = [1];
1442  let NumMicroOps = 1;
1443}
1444def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;
1445
1446// Nop, not very useful except that it provides a model for nops!
1447defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1448
1449
1450///////////////////////////////////////////////////////////////////////////////
1451// Zero Cycle Move
1452///////////////////////////////////////////////////////////////////////////////
1453
// Write that consumes no execution resources: empty resource list, zero
// latency, one micro-op. Bound here to the 32/64-bit GPR register moves and
// MOVSX32rr32, and reused by the zero-idiom SchedWriteVariants further down
// in this file.
1454def Zn3WriteZeroLatency : SchedWriteRes<[]> {
1455  let Latency = 0;
1456  let ResourceCycles = [];
1457  let NumMicroOps = 1;
1458}
1459def : InstRW<[Zn3WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1460                                               MOV64rr, MOV64rr_REV,
1461                                               MOVSX32rr32)>;
1462
// Same shape as the zero-latency move write (no resources, zero latency) but
// with two micro-ops. Bound to the 32/64-bit register-register XCHG opcodes.
1463def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
1464  let Latency = 0;
1465  let ResourceCycles = [];
1466  let NumMicroOps = 2;
1467}
1468def : InstRW<[Zn3WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1469                                               XCHG64rr, XCHG64ar)>;
1470
// Generic XCHG write; the 32/64-bit register-register XCHG opcodes are
// overridden separately via Zn3WriteSwapRenameable.
1471defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;        // Compare+Exchange - TODO RMW support.
1472
// FP register moves. The X (XMM) and Y (YMM) writes take no resources and
// have zero latency; the base WriteFMove keeps a 1-cycle entry.
1473defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
1474defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1475defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1476
// Vector integer register moves, same pattern: the base write (MMX) costs
// one cycle, the X/Y writes take no resources.
// NOTE(review): WriteFMove uses Zn3FPVMisc0123 while WriteVecMove uses
// Zn3FPFMisc0123 - confirm the V/F difference is intentional.
1477defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
1478defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1479defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
1480
// Declares which register-move opcodes are optimizable for this model.
// All opcodes sit in one InstructionEquivalenceClass guarded by TruePred,
// i.e. no extra per-instruction condition is attached.
// The GPR entries match the opcodes bound to Zn3WriteZeroLatency and
// Zn3WriteSwapRenameable earlier in this file.
// NOTE(review): presumably consumed by move-elimination analysis in tools
// such as llvm-mca - confirm against the IsOptimizableRegisterMove definition.
1481def : IsOptimizableRegisterMove<[
1482  InstructionEquivalenceClass<[
1483    // GPR variants.
1484    MOV32rr, MOV32rr_REV,
1485    MOV64rr, MOV64rr_REV,
1486    MOVSX32rr32,
1487    XCHG32rr, XCHG32ar,
1488    XCHG64rr, XCHG64ar,
1489
1490    // MMX variants.
1491    // MMX moves are *NOT* eliminated.
1492
1493    // SSE variants.
1494    MOVAPSrr, MOVAPSrr_REV,
1495    MOVUPSrr, MOVUPSrr_REV,
1496    MOVAPDrr, MOVAPDrr_REV,
1497    MOVUPDrr, MOVUPDrr_REV,
1498    MOVDQArr, MOVDQArr_REV,
1499    MOVDQUrr, MOVDQUrr_REV,
1500
1501    // AVX variants.
1502    VMOVAPSrr, VMOVAPSrr_REV,
1503    VMOVUPSrr, VMOVUPSrr_REV,
1504    VMOVAPDrr, VMOVAPDrr_REV,
1505    VMOVUPDrr, VMOVUPDrr_REV,
1506    VMOVDQArr, VMOVDQArr_REV,
1507    VMOVDQUrr, VMOVDQUrr_REV,
1508
1509    // AVX YMM variants.
1510    VMOVAPSYrr, VMOVAPSYrr_REV,
1511    VMOVUPSYrr, VMOVUPSYrr_REV,
1512    VMOVAPDYrr, VMOVAPDYrr_REV,
1513    VMOVUPDYrr, VMOVUPDYrr_REV,
1514    VMOVDQAYrr, VMOVDQAYrr_REV,
1515    VMOVDQUYrr, VMOVDQUYrr_REV,
1516  ], TruePred >
1517]>;
1518
1519///////////////////////////////////////////////////////////////////////////////
1520// Dependency breaking instructions.
1521///////////////////////////////////////////////////////////////////////////////
1522
// Variant write for GPR zero idioms: when ZeroIdiomPredicate holds the
// instruction is scheduled as Zn3WriteZeroLatency, otherwise it falls back
// to the regular WriteALU. Applied to 32/64-bit XOR and SUB register forms.
1523def Zn3WriteZeroIdiom : SchedWriteVariant<[
1524    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1525    SchedVar<NoSchedPred,                          [WriteALU]>
1526]>;
1527def : InstRW<[Zn3WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1528                                          XOR64rr, XOR64rr_REV,
1529                                          SUB32rr, SUB32rr_REV,
1530                                          SUB64rr, SUB64rr_REV)>;
1531
// Same idea for register-register CMP of all widths, but the predicate is
// CheckSameRegOperand<0, 1> (operands 0 and 1 are the same register) rather
// than ZeroIdiomPredicate.
1532def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1533    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
1534    SchedVar<NoSchedPred,                                 [WriteALU]>
1535]>;
1536def : InstRW<[Zn3WriteZeroIdiomEFLAGS], (instrs CMP8rr,  CMP8rr_REV,
1537                                                CMP16rr, CMP16rr_REV,
1538                                                CMP32rr, CMP32rr_REV,
1539                                                CMP64rr, CMP64rr_REV)>;
1540
// Variant write for XMM floating-point logic zero idioms: zero-latency write
// when ZeroIdiomPredicate holds, WriteFLogic otherwise. Only the VEX-encoded
// opcodes are bound.
1541def Zn3WriteFZeroIdiom : SchedWriteVariant<[
1542    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1543    SchedVar<NoSchedPred,                          [WriteFLogic]>
1544]>;
1545// NOTE: XORPSrr, XORPDrr are not zero-cycle!
1546def : InstRW<[Zn3WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1547                                           VANDNPSrr, VANDNPDrr)>;
1548
// YMM counterpart: same structure, falling back to WriteFLogicY.
1549def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
1550    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1551    SchedVar<NoSchedPred,                          [WriteFLogicY]>
1552]>;
1553def : InstRW<[Zn3WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1554                                            VANDNPSYrr, VANDNPDYrr)>;
1555
// Variant write for XMM vector-integer logic zero idioms: zero-latency write
// when ZeroIdiomPredicate holds, WriteVecLogicX otherwise. Only the
// VEX-encoded opcodes are bound.
1556def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
1557    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1558    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
1559]>;
1560// NOTE: PXORrr, PANDNrr are not zero-cycle!
1561def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
1562
// YMM counterpart: same structure, falling back to WriteVecLogicY.
1563def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
1564    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1565    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
1566]>;
1567def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
1568
// Variant write for XMM vector-integer ALU zero idioms: zero-latency write
// when ZeroIdiomPredicate holds, WriteVecALUX otherwise. Bound to the
// VEX-encoded VPSUB{B,W,D,Q} and VPCMPGT{B,W,D,Q} register forms.
// NOTE(review): the saturating subtracts (VPSUBS*/VPSUBUS*) are listed in
// IsZeroIdiomFunction later in this file but get no variant write here -
// confirm that is intentional.
1569def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
1570    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1571    SchedVar<NoSchedPred,                          [WriteVecALUX]>
1572]>;
1573// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1574//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1575def : InstRW<[Zn3WriteVZeroIdiomALUX],
1576             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1577                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1578
// YMM counterpart: same structure, falling back to WriteVecALUY.
1579def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
1580    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1581    SchedVar<NoSchedPred,                          [WriteVecALUY]>
1582]>;
1583def : InstRW<[Zn3WriteVZeroIdiomALUY],
1584             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1585                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1586
// Declares the opcodes treated as zero idioms for this model. Every class
// below is guarded by ZeroIdiomPredicate.
// This list is wider than the InstRW overrides earlier in this file: it also
// contains the legacy-SSE opcodes (which the NOTE comments there call "not
// zero-cycle") and the saturating subtracts, none of which are bound to a
// zero-latency variant write.
// NOTE(review): presumably this means those opcodes are recognised as
// dependency-breaking zero idioms while still executing normally - confirm
// against the IsZeroIdiomFunction definition.
1587def : IsZeroIdiomFunction<[
1588  // GPR Zero-idioms.
1589  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1590                     XOR64rr, XOR64rr_REV,
1591                     SUB32rr, SUB32rr_REV,
1592                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1593
1594  // SSE XMM Zero-idioms.
1595  DepBreakingClass<[
1596    // fp variants.
1597    XORPSrr, XORPDrr,
1598    ANDNPSrr, ANDNPDrr,
1599
1600    // int variants.
1601    PXORrr,
1602    PANDNrr,
1603    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1604    PSUBSBrr, PSUBSWrr,
1605    PSUBUSBrr, PSUBUSWrr,
1606    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1607  ], ZeroIdiomPredicate>,
1608
1609  // AVX XMM Zero-idioms.
1610  DepBreakingClass<[
1611    // fp variants.
1612    VXORPSrr, VXORPDrr,
1613    VANDNPSrr, VANDNPDrr,
1614
1615    // int variants.
1616    VPXORrr,
1617    VPANDNrr,
1618    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1619    VPSUBSBrr, VPSUBSWrr,
1620    VPSUBUSBrr, VPSUBUSWrr,
1621    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1622  ], ZeroIdiomPredicate>,
1623
1624  // AVX YMM Zero-idioms.
1625  DepBreakingClass<[
1626    // fp variants.
1627    VXORPSYrr, VXORPDYrr,
1628    VANDNPSYrr, VANDNPDYrr,
1629
1630    // int variants.
1631    VPXORYrr,
1632    VPANDNYrr,
1633    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1634    VPSUBSBYrr, VPSUBSWYrr,
1635    VPSUBUSBYrr, VPSUBUSWYrr,
1636    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1637  ], ZeroIdiomPredicate>,
1638]>;
1639
// Declares additional dependency-breaking opcodes that are not listed as
// zero idioms above: SBB, register-register CMP, and the PCMPEQ family.
// All classes use ZeroIdiomPredicate except CMP, which uses
// CheckSameRegOperand<0, 1> (the same predicate as Zn3WriteZeroIdiomEFLAGS).
// NOTE(review): presumably these produce a result independent of the source
// register value when both operands are the same register, without that
// result being zero - confirm against the IsDepBreakingFunction definition.
1640def : IsDepBreakingFunction<[
1641  // GPR
1642  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
1643                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
1644  DepBreakingClass<[ CMP8rr,  CMP8rr_REV,
1645                     CMP16rr, CMP16rr_REV,
1646                     CMP32rr, CMP32rr_REV,
1647                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
1648
1649  // MMX
1650  DepBreakingClass<[
1651    MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr
1652  ], ZeroIdiomPredicate>,
1653
1654  // SSE
1655  DepBreakingClass<[
1656    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1657  ], ZeroIdiomPredicate>,
1658
1659  // AVX XMM
1660  DepBreakingClass<[
1661    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1662  ], ZeroIdiomPredicate>,
1663
1664  // AVX YMM
1665  DepBreakingClass<[
1666    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
1667  ], ZeroIdiomPredicate>,
1668]>;
1669
1670} // SchedModel
1671