xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver3.td (revision e32fecd0c2c3ee37c47ee100f169e7eb0282a873)
1//=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver3 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15//    http://www.agner.org/optimize/microarchitecture.pdf
16//  * AMD Zen 3 Ryzen Deep Dive Review
17//    https://www.anandtech.com/show/16214/
18//===----------------------------------------------------------------------===//
19
20def Znver3Model : SchedMachineModel {
21  // AMD SOG 19h, 2.9.6 Dispatch
22  // The processor may dispatch up to 6 macro ops per cycle
23  // into the execution engine.
24  let IssueWidth = 6;
25  // AMD SOG 19h, 2.10.3
26  // The retire control unit (RCU) tracks the completion status of all
27  // outstanding operations (integer, load/store, and floating-point) and is
28  // the final arbiter for exception processing and recovery.
29  // The unit can receive up to 6 macro ops dispatched per cycle and track up
30  // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
31  let MicroOpBufferSize = 256;
32  // AMD SOG 19h, 2.9.1 Op Cache
33  // The op cache is organized as an associative cache with 64 sets and 8 ways.
34  // At each set-way intersection is an entry containing up to 8 macro ops.
35  // The maximum capacity of the op cache is 4K ops.
36  // Agner, 22.5 µop cache
37  // The size of the µop cache is big enough for holding most critical loops.
38  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadradic complexity,
39  //        with large values here the compilation of certain loops
40  //        ends up taking way too long.
41  // let LoopMicroOpBufferSize = 4096;
42  let LoopMicroOpBufferSize = 512;
43  // AMD SOG 19h, 2.6.2 L1 Data Cache
44  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
45  // AMD SOG 19h, 2.12 L1 Data Cache
46  // The AGU and LS pipelines are optimized for simple address generation modes.
47  // <...> and can achieve 4-cycle load-to-use integer load latency.
48  let LoadLatency = 4;
49  // AMD SOG 19h, 2.12 L1 Data Cache
50  // The AGU and LS pipelines are optimized for simple address generation modes.
51  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
52  int VecLoadLatency = 7;
53  // Latency of a simple store operation.
54  int StoreLatency = 1;
55  // FIXME
56  let HighLatency = 25; // FIXME: any better choice?
57  // AMD SOG 19h, 2.8 Optimizing Branching
58  // The branch misprediction penalty is in the range from 11 to 18 cycles,
59  // <...>. The common case penalty is 13 cycles.
60  let MispredictPenalty = 13;
61
62  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
63
64  let CompleteModel = 1;
65}
66
67let SchedModel = Znver3Model in {
68
69
70//===----------------------------------------------------------------------===//
71// RCU
72//===----------------------------------------------------------------------===//
73
74// AMD SOG 19h, 2.10.3 Retire Control Unit
75// The unit can receive up to 6 macro ops dispatched per cycle and track up to
76// 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
77// The retire unit handles in-order commit of up to eight macro ops per cycle.
78def Zn3RCU : RetireControlUnit<Znver3Model.MicroOpBufferSize, 8>;
79
80//===----------------------------------------------------------------------===//
81// Units
82//===----------------------------------------------------------------------===//
83
84// There are total of three Units, each one with it's own schedulers.
85
86//===----------------------------------------------------------------------===//
87// Integer Execution Unit
88//
89
90// AMD SOG 19h, 2.4 Superscalar Organization
91// The processor uses four decoupled independent integer scheduler queues,
92// each one servicing one ALU pipeline and one or two other pipelines
93
94//
95// Execution pipes
96//===----------------------------------------------------------------------===//
97
98// AMD SOG 19h, 2.10.2 Execution Units
99// The processor contains 4 general purpose integer execution pipes.
100// Each pipe has an ALU capable of general purpose integer operations.
101def Zn3ALU0 : ProcResource<1>;
102def Zn3ALU1 : ProcResource<1>;
103def Zn3ALU2 : ProcResource<1>;
104def Zn3ALU3 : ProcResource<1>;
105
106// AMD SOG 19h, 2.10.2 Execution Units
107// There is also a separate branch execution unit.
108def Zn3BRU1 : ProcResource<1>;
109
110// AMD SOG 19h, 2.10.2 Execution Units
111// There are three Address Generation Units (AGUs) for all load and store
112// address generation. There are also 3 store data movement units
113// associated with the same schedulers as the AGUs.
114def Zn3AGU0 : ProcResource<1>;
115def Zn3AGU1 : ProcResource<1>;
116def Zn3AGU2 : ProcResource<1>;
117
118//
119// Execution Units
120//===----------------------------------------------------------------------===//
121
122// AMD SOG 19h, 2.10.2 Execution Units
123// ALU0 additionally has divide <...> execution capability.
124defvar Zn3Divider = Zn3ALU0;
125
126// AMD SOG 19h, 2.10.2 Execution Units
127// ALU0 additionally has <...> branch execution capability.
128defvar Zn3BRU0 = Zn3ALU0;
129
130// Integer Multiplication issued on ALU1.
131defvar Zn3Multiplier = Zn3ALU1;
132
133// Execution pipeline grouping
134//===----------------------------------------------------------------------===//
135
136// General ALU operations
137def Zn3ALU0123 : ProcResGroup<[Zn3ALU0, Zn3ALU1, Zn3ALU2, Zn3ALU3]>;
138
139// General AGU operations
140def Zn3AGU012 : ProcResGroup<[Zn3AGU0, Zn3AGU1, Zn3AGU2]>;
141
142// Control flow: jumps, calls
143def Zn3BRU01 : ProcResGroup<[Zn3BRU0, Zn3BRU1]>;
144
145// Everything that isn't control flow, but still needs to access CC register,
146// namely: conditional moves, SETcc.
147def Zn3ALU03 : ProcResGroup<[Zn3ALU0, Zn3ALU3]>;
148
149// Zn3ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
150
151// Simple bit twiddling: bit test, shift/rotate, bit extraction
152def Zn3ALU12 : ProcResGroup<[Zn3ALU1, Zn3ALU2]>;
153
154
155//
156// Scheduling
157//===----------------------------------------------------------------------===//
158
159// AMD SOG 19h, 2.10.3 Retire Control Unit
160// The integer physical register file (PRF) consists of 192 registers.
161def Zn3IntegerPRF : RegisterFile<192, [GR64, CCR], [1, 1], [1, 0],
162                              6,  // Max moves that can be eliminated per cycle.
163                              0>; // Restrict move elimination to zero regs.
164
165// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
166// AMD SOG 19h, 2.10.1 Schedulers
167// The schedulers can receive up to six macro ops per cycle, with a limit of
168// two per scheduler. Each scheduler can issue one micro op per cycle into
169// each of its associated pipelines
170// FIXME: these are 4 separate schedulers, not a single big one.
171def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
172                           Zn3ALU1, Zn3AGU1,          // scheduler 1
173                           Zn3ALU2, Zn3AGU2,          // scheduler 2
174                           Zn3ALU3,          Zn3BRU1  // scheduler 3
175                          ]> {
176  let BufferSize = !mul(4, 24);
177}
178
179
180//===----------------------------------------------------------------------===//
181// Floating-Point Unit
182//
183
184// AMD SOG 19h, 2.4 Superscalar Organization
185// The processor uses <...> two decoupled independent floating point schedulers
186// each servicing two FP pipelines and one store or FP-to-integer pipeline.
187
188//
189// Execution pipes
190//===----------------------------------------------------------------------===//
191
192// AMD SOG 19h, 2.10.1 Schedulers
193// <...>, and six FPU pipes.
194// Agner, 22.10 Floating point execution pipes
195// There are six floating point/vector execution pipes,
196def Zn3FPP0  : ProcResource<1>;
197def Zn3FPP1  : ProcResource<1>;
198def Zn3FPP2  : ProcResource<1>;
199def Zn3FPP3  : ProcResource<1>;
200def Zn3FPP45 : ProcResource<2>;
201
202//
203// Execution Units
204//===----------------------------------------------------------------------===//
205// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
206
207// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
208defvar Zn3FPFMul0 = Zn3FPP0;
209defvar Zn3FPFMul1 = Zn3FPP1;
210
211// (v)FADD*
212defvar Zn3FPFAdd0 = Zn3FPP2;
213defvar Zn3FPFAdd1 = Zn3FPP3;
214
215// All convert operations except pack/unpack
216defvar Zn3FPFCvt0 = Zn3FPP2;
217defvar Zn3FPFCvt1 = Zn3FPP3;
218
219// All Divide and Square Root except Reciprocal Approximation
220// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
221// FDIV unit can support 2 simultaneous operations in flight
222// even though it occupies a single pipe.
223// FIXME: BufferSize=2 ?
224defvar Zn3FPFDiv = Zn3FPP1;
225
226// Moves and Logical operations on Floating Point Data Types
227defvar Zn3FPFMisc0 = Zn3FPP0;
228defvar Zn3FPFMisc1 = Zn3FPP1;
229defvar Zn3FPFMisc2 = Zn3FPP2;
230defvar Zn3FPFMisc3 = Zn3FPP3;
231
232// Integer Adds, Subtracts, and Compares
233// Some complex VADD operations are not available in all pipes.
234defvar Zn3FPVAdd0 = Zn3FPP0;
235defvar Zn3FPVAdd1 = Zn3FPP1;
236defvar Zn3FPVAdd2 = Zn3FPP2;
237defvar Zn3FPVAdd3 = Zn3FPP3;
238
239// Integer Multiplies, SAD, Blendvb
240defvar Zn3FPVMul0 = Zn3FPP0;
241defvar Zn3FPVMul1 = Zn3FPP3;
242
243// Data Shuffles, Packs, Unpacks, Permute
244// Some complex shuffle operations are only available in pipe1.
245defvar Zn3FPVShuf = Zn3FPP1;
246defvar Zn3FPVShufAux = Zn3FPP2;
247
248// Bit Shift Left/Right operations
249defvar Zn3FPVShift0 = Zn3FPP1;
250defvar Zn3FPVShift1 = Zn3FPP2;
251
252// Moves and Logical operations on Packed Integer Data Types
253defvar Zn3FPVMisc0 = Zn3FPP0;
254defvar Zn3FPVMisc1 = Zn3FPP1;
255defvar Zn3FPVMisc2 = Zn3FPP2;
256defvar Zn3FPVMisc3 = Zn3FPP3;
257
258// *AES*
259defvar Zn3FPAES0 = Zn3FPP0;
260defvar Zn3FPAES1 = Zn3FPP1;
261
262// *CLM*
263defvar Zn3FPCLM0 = Zn3FPP0;
264defvar Zn3FPCLM1 = Zn3FPP1;
265
266// Execution pipeline grouping
267//===----------------------------------------------------------------------===//
268
269// AMD SOG 19h, 2.11 Floating-Point Unit
270// Stores and floating point to general purpose register transfer
271// have 2 dedicated pipelines (pipe 5 and 6).
272def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
273
274// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
275def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
276
277// (v)FADD*
278// Some complex VADD operations are not available in all pipes.
279def Zn3FPFAdd01 : ProcResGroup<[Zn3FPFAdd0, Zn3FPFAdd1]>;
280
281// All convert operations except pack/unpack
282def Zn3FPFCvt01 : ProcResGroup<[Zn3FPFCvt0, Zn3FPFCvt1]>;
283
284// All Divide and Square Root except Reciprocal Approximation
285// def Zn3FPFDiv : ProcResGroup<[Zn3FPFDiv]>;
286
287// Moves and Logical operations on Floating Point Data Types
288def Zn3FPFMisc0123 : ProcResGroup<[Zn3FPFMisc0, Zn3FPFMisc1, Zn3FPFMisc2, Zn3FPFMisc3]>;
289
290def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
291
292// Loads, Stores and Move to General Register (EX) Operations
293// AMD SOG 19h, 2.11 Floating-Point Unit
294// Stores and floating point to general purpose register transfer
295// have 2 dedicated pipelines (pipe 5 and 6).
296defvar Zn3FPLd01 = Zn3FPP45;
297
298// AMD SOG 19h, 2.11 Floating-Point Unit
299// Note that FP stores are supported on two pipelines,
300// but throughput is limited to one per cycle.
301let Super = Zn3FPP45 in
302def Zn3FPSt : ProcResource<1>;
303
304// Integer Adds, Subtracts, and Compares
305// Some complex VADD operations are not available in all pipes.
306def Zn3FPVAdd0123 : ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1, Zn3FPVAdd2, Zn3FPVAdd3]>;
307
308def Zn3FPVAdd01: ProcResGroup<[Zn3FPVAdd0, Zn3FPVAdd1]>;
309def Zn3FPVAdd12: ProcResGroup<[Zn3FPVAdd1, Zn3FPVAdd2]>;
310
311// Integer Multiplies, SAD, Blendvb
312def Zn3FPVMul01 : ProcResGroup<[Zn3FPVMul0, Zn3FPVMul1]>;
313
314// Data Shuffles, Packs, Unpacks, Permute
315// Some complex shuffle operations are only available in pipe1.
316def Zn3FPVShuf01 : ProcResGroup<[Zn3FPVShuf, Zn3FPVShufAux]>;
317
318// Bit Shift Left/Right operations
319def Zn3FPVShift01 : ProcResGroup<[Zn3FPVShift0, Zn3FPVShift1]>;
320
321// Moves and Logical operations on Packed Integer Data Types
322def Zn3FPVMisc0123 : ProcResGroup<[Zn3FPVMisc0, Zn3FPVMisc1, Zn3FPVMisc2, Zn3FPVMisc3]>;
323
324// *AES*
325def Zn3FPAES01 : ProcResGroup<[Zn3FPAES0, Zn3FPAES1]>;
326
327// *CLM*
328def Zn3FPCLM01 : ProcResGroup<[Zn3FPCLM0, Zn3FPCLM1]>;
329
330
331//
332// Scheduling
333//===----------------------------------------------------------------------===//
334
335// Agner, 21.8 Register renaming and out-of-order schedulers
336// The floating point register file has 160 vector registers
337// of 128 bits each in Zen 1 and 256 bits each in Zen 2.
338// anandtech also confirms this.
339def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
340                            6,  // Max moves that can be eliminated per cycle.
341                            0>; // Restrict move elimination to zero regs.
342
343// AMD SOG 19h, 2.11 Floating-Point Unit
344// The floating-point scheduler has a 2*32 entry macro op capacity.
345// AMD SOG 19h, 2.11 Floating-Point Unit
346// <...> the scheduler can issue 1 micro op per cycle for each pipe.
347// FIXME: those are two separate schedulers, not a single big one.
348def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2,          /*Zn3FPP4,*/ // scheduler 0
349                          Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/  // scheduler 1
350                         ]> {
351  let BufferSize = !mul(2, 32);
352}
353
354// AMD SOG 19h, 2.11 Floating-Point Unit
355// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
356// even if floating-point scheduler is full.
357// FIXME: how to model this properly?
358
359
360//===----------------------------------------------------------------------===//
361// Load-Store Unit
362//
363
364// AMD SOG 19h, 2.12 Load-Store Unit
365// The LS unit contains three largely independent pipe-lines
366// enabling the execution of three 256-bit memory operations per cycle.
367def Zn3LSU : ProcResource<3>;
368
369// AMD SOG 19h, 2.12 Load-Store Unit
370// All three memory operations can be loads.
371let Super = Zn3LSU in
372def Zn3Load : ProcResource<3> {
373  // AMD SOG 19h, 2.12 Load-Store Unit
374  // The LS unit can process up to 72 out-of-order loads.
375  let BufferSize = 72;
376}
377
378def Zn3LoadQueue : LoadQueue<Zn3Load>;
379
380// AMD SOG 19h, 2.12 Load-Store Unit
381// A maximum of two of the memory operations can be stores.
382let Super = Zn3LSU in
383def Zn3Store : ProcResource<2> {
384  // AMD SOG 19h, 2.12 Load-Store Unit
385  // The LS unit utilizes a 64-entry store queue (STQ).
386  let BufferSize = 64;
387}
388
389def Zn3StoreQueue : StoreQueue<Zn3Store>;
390
391//===----------------------------------------------------------------------===//
392// Basic helper classes.
393//===----------------------------------------------------------------------===//
394
395// Many SchedWrites are defined in pairs with and without a folded load.
396// Instructions with folded loads are usually micro-fused, so they only appear
397// as two micro-ops when dispatched by the schedulers.
398// This multiclass defines the resource usage for variants with and without
399// folded loads.
400
401multiclass __zn3WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
402                         int Lat = 1, list<int> Res = [], int UOps = 1> {
403  def : WriteRes<SchedRW, ExePorts> {
404    let Latency = Lat;
405    let ResourceCycles = Res;
406    let NumMicroOps = UOps;
407  }
408}
409
410multiclass __zn3WriteResPair<X86FoldableSchedWrite SchedRW,
411                             list<ProcResourceKind> ExePorts, int Lat,
412                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
413                             ProcResourceKind AGU, int LoadRes> {
414  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
415
416  defm : __zn3WriteRes<SchedRW.Folded,
417                       !listconcat([AGU, Zn3Load], ExePorts),
418                       !add(Lat, LoadLat),
419                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
420                         [],
421                         !listconcat([1, LoadRes],
422                           !if(!empty(Res),
423                             !listsplat(1, !size(ExePorts)),
424                             Res))),
425                       !add(UOps, LoadUOps)>;
426}
427
428// For classes without folded loads.
429multiclass Zn3WriteResInt<SchedWrite SchedRW,
430                          list<ProcResourceKind> ExePorts, int Lat = 1,
431                          list<int> Res = [], int UOps = 1> {
432  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
433}
434
435multiclass Zn3WriteResXMM<SchedWrite SchedRW,
436                          list<ProcResourceKind> ExePorts, int Lat = 1,
437                          list<int> Res = [], int UOps = 1> {
438  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
439}
440
441multiclass Zn3WriteResYMM<SchedWrite SchedRW,
442                          list<ProcResourceKind> ExePorts, int Lat = 1,
443                          list<int> Res = [], int UOps = 1> {
444  defm : __zn3WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
445}
446
447// For classes with folded loads.
448multiclass Zn3WriteResIntPair<X86FoldableSchedWrite SchedRW,
449                              list<ProcResourceKind> ExePorts, int Lat = 1,
450                              list<int> Res = [], int UOps = 1,
451                              int LoadUOps = 0, int LoadRes = 1> {
452  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
453                           Znver3Model.LoadLatency,
454                           LoadUOps, Zn3AGU012, LoadRes>;
455}
456
457multiclass Zn3WriteResXMMPair<X86FoldableSchedWrite SchedRW,
458                              list<ProcResourceKind> ExePorts, int Lat = 1,
459                              list<int> Res = [], int UOps = 1,
460                              int LoadUOps = 0, int LoadRes = 1> {
461  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
462                           Znver3Model.VecLoadLatency,
463                           LoadUOps, Zn3FPLd01, LoadRes>;
464}
465
466multiclass Zn3WriteResYMMPair<X86FoldableSchedWrite SchedRW,
467                              list<ProcResourceKind> ExePorts, int Lat = 1,
468                              list<int> Res = [], int UOps = 1,
469                              int LoadUOps = 0, int LoadRes = 1> {
470  defm : __zn3WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
471                           Znver3Model.VecLoadLatency,
472                           LoadUOps, Zn3FPLd01, LoadRes>;
473}
474
475
476//===----------------------------------------------------------------------===//
477// Here be dragons.
478//===----------------------------------------------------------------------===//
479
480def : ReadAdvance<ReadAfterLd, Znver3Model.LoadLatency>;
481
482def : ReadAdvance<ReadAfterVecLd, Znver3Model.VecLoadLatency>;
483def : ReadAdvance<ReadAfterVecXLd, Znver3Model.VecLoadLatency>;
484def : ReadAdvance<ReadAfterVecYLd, Znver3Model.VecLoadLatency>;
485
486// AMD SOG 19h, 2.11 Floating-Point Unit
487// There is 1 cycle of added latency for a result to cross
488// from F to I or I to F domain.
489def : ReadAdvance<ReadInt2Fpu, -1>;
490
491// Instructions with both a load and a store folded are modeled as a folded
492// load + WriteRMW.
493defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 1], 0>;
494
495// Loads, stores, and moves, not folded with other operations.
496defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
497
498// Model the effect of clobbering the read-write mask operand of the GATHER operation.
499// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
500defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;
501
502def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
503  let Latency = !add(Znver3Model.LoadLatency, 1);
504  let ResourceCycles = [3, 1];
505  let NumMicroOps = 1;
506}
507def : InstRW<[Zn3WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
508
509defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
510defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
511defm : Zn3WriteResInt<WriteMove, [Zn3ALU0123], 1, [4], 1>;
512
513// Treat misc copies as a move.
514def : InstRW<[WriteMove], (instrs COPY)>;
515
516def Zn3WriteMOVBE16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
517  let Latency = Znver3Model.LoadLatency;
518  let ResourceCycles = [1, 1, 4];
519  let NumMicroOps = 1;
520}
521def : InstRW<[Zn3WriteMOVBE16rm], (instrs MOVBE16rm)>;
522
523def Zn3WriteMOVBEmr : SchedWriteRes<[Zn3ALU0123, Zn3AGU012, Zn3Store]> {
524  let Latency = Znver3Model.StoreLatency;
525  let ResourceCycles = [4, 1, 1];
526  let NumMicroOps = 2;
527}
528def : InstRW<[Zn3WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
529
530// Arithmetic.
531defm : Zn3WriteResIntPair<WriteALU, [Zn3ALU0123], 1, [1], 1>; // Simple integer ALU op.
532
533def Zn3WriteALUSlow : SchedWriteRes<[Zn3ALU0123]> {
534  let Latency = 1;
535  let ResourceCycles = [4];
536  let NumMicroOps = 1;
537}
538def : InstRW<[Zn3WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
539                                        AND8i8, AND16i16, AND32i32, AND64i32,
540                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
541                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
542                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
543
544def Zn3WriteMoveExtend : SchedWriteRes<[Zn3ALU0123]> {
545  let Latency = 1;
546  let ResourceCycles = [4];
547  let NumMicroOps = 1;
548}
549def : InstRW<[Zn3WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
550
551def Zn3WriteMaterialize32bitImm: SchedWriteRes<[Zn3ALU0123]> {
552  let Latency = 1;
553  let ResourceCycles = [2];
554  let NumMicroOps = 1;
555}
556def : InstRW<[Zn3WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
557
558def Zn3WritePDEP_PEXT : SchedWriteRes<[Zn3ALU1]> {
559  let Latency = 3;
560  let ResourceCycles = [1];
561  let NumMicroOps = 1;
562}
563def : InstRW<[Zn3WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
564                                          PEXT32rr, PEXT64rr)>;
565
566defm : Zn3WriteResIntPair<WriteADC, [Zn3ALU0123], 1, [4], 1>; // Integer ALU + flags op.
567
568def Zn3WriteADC8mr_SBB8mr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123, Zn3Store]> {
569  let Latency = 1;
570  let ResourceCycles = [1, 1, 7, 1];
571  let NumMicroOps = 1;
572}
573def : InstRW<[Zn3WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
574
575// This is for simple LEAs with one or two input operands.
576defm : Zn3WriteResInt<WriteLEA, [Zn3AGU012], 1, [1], 1>;     // LEA instructions can't fold loads.
577
578// This write is used for slow LEA instructions.
579def Zn3Write3OpsLEA : SchedWriteRes<[Zn3ALU0123]> {
580  let Latency = 2;
581  let ResourceCycles = [1];
582  let NumMicroOps = 2;
583}
584
585// On Znver3, a slow LEA is either a 3Ops LEA (base, index, offset),
586// or an LEA with a `Scale` value different than 1.
587def Zn3SlowLEAPredicate : MCSchedPredicate<
588  CheckAny<[
589    // A 3-operand LEA (base, index, offset).
590    IsThreeOperandsLEAFn,
591    // An LEA with a "Scale" different than 1.
592    CheckAll<[
593      CheckIsImmOperand<2>,
594      CheckNot<CheckImmOperand<2, 1>>
595    ]>
596  ]>
597>;
598
599def Zn3WriteLEA : SchedWriteVariant<[
600    SchedVar<Zn3SlowLEAPredicate, [Zn3Write3OpsLEA]>,
601    SchedVar<NoSchedPred,         [WriteLEA]>
602]>;
603
604def : InstRW<[Zn3WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
605
606def Zn3SlowLEA16r : SchedWriteRes<[Zn3ALU0123]> {
607  let Latency = 2; // FIXME: not from llvm-exegesis
608  let ResourceCycles = [4];
609  let NumMicroOps = 2;
610}
611
612def : InstRW<[Zn3SlowLEA16r], (instrs LEA16r)>;
613
614// Integer multiplication
615defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
616defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
617defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
618defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
619defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
620defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
621defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
622defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
623defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
624defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
625defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
626defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
627defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>;  // Integer multiplication, high part.
628defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>;  // Integer multiplication, high part.
629
630defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
631defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
632
633defm : Zn3WriteResIntPair<WriteCMPXCHG, [Zn3ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
634
635def Zn3WriteCMPXCHG8rr : SchedWriteRes<[Zn3ALU0123]> {
636  let Latency = 3;
637  let ResourceCycles = [12];
638  let NumMicroOps = 3;
639}
640def : InstRW<[Zn3WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
641
642defm : Zn3WriteResInt<WriteCMPXCHGRMW, [Zn3ALU0123], 3, [12], 6>;     // Compare and set, compare and swap.
643
644def Zn3WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
645  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteCMPXCHG8rr.Latency);
646  let ResourceCycles = [1, 1, 12];
647  let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
648}
649def : InstRW<[Zn3WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
650
651def Zn3WriteCMPXCHG8B : SchedWriteRes<[Zn3ALU0123]> {
652  let Latency = 3; // FIXME: not from llvm-exegesis
653  let ResourceCycles = [24];
654  let NumMicroOps = 19;
655}
656def : InstRW<[Zn3WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
657
658def Zn3WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn3ALU0123]> {
659  let Latency = 4; // FIXME: not from llvm-exegesis
660  let ResourceCycles = [59];
661  let NumMicroOps = 28;
662}
663def : InstRW<[Zn3WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
664
665def Zn3WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn3ALU0123]> {
666  let Latency = 1;
667  let ResourceCycles = [2];
668  let NumMicroOps = 2;
669}
670def : InstRW<[Zn3WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
671
672def Zn3WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
673  let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
674  let ResourceCycles = [1, 1, 2];
675  let NumMicroOps = 5;
676}
677def : InstRW<[Zn3WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
678
679def Zn3WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU0123]> {
680  let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
681  let ResourceCycles = [1, 1, 2];
682  let NumMicroOps = 2;
683}
684def : InstRW<[Zn3WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
685
686// Integer division.
687// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
688// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
689defm : Zn3WriteResIntPair<WriteDiv8, [Zn3Divider], 10, [10], 2>;
690defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
691defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
692defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;
693defm : Zn3WriteResIntPair<WriteIDiv8, [Zn3Divider], 10, [10], 2>;
694defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
695defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
696defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
697
698defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
699defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.
700
701defm : Zn3WriteResIntPair<WritePOPCNT, [Zn3ALU0123], 1, [1], 1>; // Bit population count.
702
703def Zn3WritePOPCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
704  let Latency = 1;
705  let ResourceCycles = [4];
706  let NumMicroOps = 1;
707}
708def : InstRW<[Zn3WritePOPCNT16rr], (instrs POPCNT16rr)>;
709
710defm : Zn3WriteResIntPair<WriteLZCNT, [Zn3ALU0123], 1, [1], 1>; // Leading zero count.
711
712def Zn3WriteLZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
713  let Latency = 1;
714  let ResourceCycles = [4];
715  let NumMicroOps = 1;
716}
717def : InstRW<[Zn3WriteLZCNT16rr], (instrs LZCNT16rr)>;
718
719defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.
720
721def Zn3WriteTZCNT16rr : SchedWriteRes<[Zn3ALU0123]> {
722  let Latency = 2;
723  let ResourceCycles = [4];
724  let NumMicroOps = 2;
725}
726def : InstRW<[Zn3WriteTZCNT16rr], (instrs TZCNT16rr)>;
727
728defm : Zn3WriteResIntPair<WriteCMOV, [Zn3ALU03], 1, [1], 1>; // Conditional move.
729defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
730defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
731defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
732defm : Zn3WriteResInt<WriteLAHFSAHF, [Zn3ALU3], 1, [1], 1>; // Load/Store flags in AH.
733
734defm : Zn3WriteResInt<WriteBitTest, [Zn3ALU12], 1, [1], 1>; // Bit Test
735defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
736defm : Zn3WriteResInt<WriteBitTestRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 7>;
737
738defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
739defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
740defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
741
742// Integer shifts and rotates.
743defm : Zn3WriteResIntPair<WriteShift, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
744defm : Zn3WriteResIntPair<WriteShiftCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
745defm : Zn3WriteResIntPair<WriteRotate, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
746
747def Zn3WriteRotateR1 : SchedWriteRes<[Zn3ALU12]> {
748  let Latency = 1;
749  let ResourceCycles = [2];
750  let NumMicroOps = 1;
751}
752def : InstRW<[Zn3WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
753                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
754
755def Zn3WriteRotateM1 : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
756  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateR1.Latency);
757  let ResourceCycles = [1, 1, 2];
758  let NumMicroOps = !add(Zn3WriteRotateR1.NumMicroOps, 1);
759}
760def : InstRW<[Zn3WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
761                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
762
763def Zn3WriteRotateRightRI : SchedWriteRes<[Zn3ALU12]> {
764  let Latency = 3;
765  let ResourceCycles = [6];
766  let NumMicroOps = 7;
767}
768def : InstRW<[Zn3WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
769
770def Zn3WriteRotateRightMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
771  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRI.Latency);
772  let ResourceCycles = [1, 1, 8];
773  let NumMicroOps = !add(Zn3WriteRotateRightRI.NumMicroOps, 3);
774}
775def : InstRW<[Zn3WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
776
777def Zn3WriteRotateLeftRI : SchedWriteRes<[Zn3ALU12]> {
778  let Latency = 4;
779  let ResourceCycles = [8];
780  let NumMicroOps = 9;
781}
782def : InstRW<[Zn3WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
783
784def Zn3WriteRotateLeftMI : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
785  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRI.Latency);
786  let ResourceCycles = [1, 1, 8];
787  let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
788}
789def : InstRW<[Zn3WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
790
791defm : Zn3WriteResIntPair<WriteRotateCL, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
792
793def Zn3WriteRotateRightRCL : SchedWriteRes<[Zn3ALU12]> {
794  let Latency = 3;
795  let ResourceCycles = [6];
796  let NumMicroOps = 7;
797}
798def : InstRW<[Zn3WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
799
800def Zn3WriteRotateRightMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
801  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateRightRCL.Latency);
802  let ResourceCycles = [1, 1, 8];
803  let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
804}
805def : InstRW<[Zn3WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
806
807def Zn3WriteRotateLeftRCL : SchedWriteRes<[Zn3ALU12]> {
808  let Latency = 4;
809  let ResourceCycles = [8];
810  let NumMicroOps = 9;
811}
812def : InstRW<[Zn3WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
813
814def Zn3WriteRotateLeftMCL : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3ALU12]> {
815  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteRotateLeftRCL.Latency);
816  let ResourceCycles = [1, 1, 8];
817  let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
818}
819def : InstRW<[Zn3WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
820
821// Double shift instructions.
822defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
823defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
824defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
825defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
826
827// BMI1 BEXTR/BLS, BMI2 BZHI
828defm : Zn3WriteResIntPair<WriteBEXTR, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
829defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
830defm : Zn3WriteResIntPair<WriteBZHI, [Zn3ALU12], 1, [1], 1, /*LoadUOps=*/1>;
831
832// Idioms that clear a register, like xorps %xmm0, %xmm0.
833// These can often bypass execution ports completely.
834defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
835
836// Branches don't produce values, so they have no latency, but they still
837// consume resources. Indirect branches can fold loads.
838defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
839
840// Floating point. This covers both scalar and vector operations.
841defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
842defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
843defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
844defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
845defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
846defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
847defm : Zn3WriteResXMM<WriteFMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
848defm : Zn3WriteResYMM<WriteFMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
849defm : Zn3WriteResXMM<WriteFStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
850
851def Zn3WriteWriteFStoreMMX : SchedWriteRes<[Zn3FPSt, Zn3Store]> {
852  let Latency = 2; // FIXME: not from llvm-exegesis
853  let ResourceCycles = [1, 1];
854  let NumMicroOps = 2;
855}
856def : InstRW<[Zn3WriteWriteFStoreMMX], (instrs MOVHPDmr,  MOVHPSmr,
857                                               VMOVHPDmr, VMOVHPSmr)>;
858
859defm : Zn3WriteResXMM<WriteFStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
860defm : Zn3WriteResYMM<WriteFStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
861defm : Zn3WriteResXMM<WriteFStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
862defm : Zn3WriteResXMM<WriteFStoreNTX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
863defm : Zn3WriteResYMM<WriteFStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
864
865defm : Zn3WriteResXMM<WriteFMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
866defm : Zn3WriteResXMM<WriteFMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
867defm : Zn3WriteResYMM<WriteFMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
868defm : Zn3WriteResYMM<WriteFMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
869
870defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point add/sub.
871
872def Zn3WriteX87Arith : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
873  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
874  let ResourceCycles = [1, 1, 24];
875  let NumMicroOps = 2;
876}
877def : InstRW<[Zn3WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
878                                         SUB_FI16m, SUB_FI32m,
879                                         SUBR_FI16m, SUBR_FI32m,
880                                         MUL_FI16m, MUL_FI32m)>;
881
882def Zn3WriteX87Div : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
883  let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
884  let ResourceCycles = [1, 1, 62];
885  let NumMicroOps = 2;
886}
887def : InstRW<[Zn3WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
888                                       DIVR_FI16m, DIVR_FI32m)>;
889
890defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
891defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
892defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
893defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
894defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
895defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
896defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
897defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>;  // Floating point compare.
898defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
899defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
900defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
901defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>;  // Floating point double compare.
902defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
903defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
904defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
905defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point compare to flags (X87).
906defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>;  // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
907defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>;  // Floating point multiplication.
908defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
909defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
910defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (YMM).
911defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
912defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
913defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
914defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
915defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>;  // Floating point division.
916defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
917defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
918defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
919defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>;  // Floating point double division.
920defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
921defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
922defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
923defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>;   // Floating point square root.
924defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
925defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
926defm : X86WriteResPairUnsupported<WriteFSqrtZ>;  // Floating point square root (ZMM).
927defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>;  // Floating point double square root.
928defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
929defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
930defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
931defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis  // Floating point long double square root.
932defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>;  // Floating point reciprocal estimate.
933defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
934defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
935defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
936defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>;  // Floating point reciprocal square root estimate.
937defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
938defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
939defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
940defm : Zn3WriteResXMMPair<WriteFMA, [Zn3FPFMul01], 4, [2], 1>;  // Fused Multiply Add.
941defm : Zn3WriteResXMMPair<WriteFMAX, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
942defm : Zn3WriteResYMMPair<WriteFMAY, [Zn3FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
943defm : X86WriteResPairUnsupported<WriteFMAZ>; // Fused Multiply Add (ZMM).
944defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
945defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
946defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
947defm : X86WriteResPairUnsupported<WriteDPPSZ>; // Floating point single dot product (ZMM).
948defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis  // Floating point fabs/fchs.
949defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
950defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
951defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
952defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
953defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
954defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
955defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
956defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
957defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
958defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
959defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
960defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
961defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
962defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
963defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
964defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
965defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
966defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
967defm : Zn3WriteResXMMPair<WriteFVarBlend, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends.
968defm : Zn3WriteResYMMPair<WriteFVarBlendY, [Zn3FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
969defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; // Fp vector variable blends (ZMM).
970
971// Horizontal Add/Sub (float and integer)
972defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
973defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
974defm : X86WriteResPairUnsupported<WriteFHAddZ>;
975defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
976defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
977defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
978defm : X86WriteResPairUnsupported<WritePHAddZ>;
979
980// Vector integer operations.
981defm : Zn3WriteResXMM<WriteVecLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
982defm : Zn3WriteResXMM<WriteVecLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
983defm : Zn3WriteResYMM<WriteVecLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
984defm : Zn3WriteResXMM<WriteVecLoadNT, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
985defm : Zn3WriteResYMM<WriteVecLoadNTY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
986defm : Zn3WriteResXMM<WriteVecMaskedLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
987defm : Zn3WriteResYMM<WriteVecMaskedLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
988defm : Zn3WriteResXMM<WriteVecStore, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
989defm : Zn3WriteResXMM<WriteVecStoreX, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
990
991def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
992  let Latency = 4;
993  let ResourceCycles = [1];
994  let NumMicroOps = 1;
995}
996def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
997
998def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
999  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1000  let ResourceCycles = [1, 1, 1];
1001  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
1002}
1003def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
1004
1005def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
1006  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
1007  let ResourceCycles = [1, 1, 1];
1008  let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
1009}
1010def : InstRW<[Zn3WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
1011
1012defm : Zn3WriteResYMM<WriteVecStoreY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1013defm : Zn3WriteResXMM<WriteVecStoreNT, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1014defm : Zn3WriteResYMM<WriteVecStoreNTY, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [1, 1], 1>;
1015defm : Zn3WriteResXMM<WriteVecMaskedStore32, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1016defm : Zn3WriteResXMM<WriteVecMaskedStore64, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [4, 1], 10>;
1017defm : Zn3WriteResYMM<WriteVecMaskedStore32Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [12, 1], 42>;
1018defm : Zn3WriteResYMM<WriteVecMaskedStore64Y, [Zn3FPSt, Zn3Store], Znver3Model.StoreLatency, [6, 1], 18>;
1019
1020defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
1021defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
1022
1023def Zn3WriteMOVMMX : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1024  let Latency = 1;
1025  let ResourceCycles = [1, 2];
1026  let NumMicroOps = 2;
1027}
1028def : InstRW<[Zn3WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1029
1030def Zn3WriteMOVMMXSlow : SchedWriteRes<[Zn3FPLd01, Zn3FPFMisc0123]> {
1031  let Latency = 1;
1032  let ResourceCycles = [1, 4];
1033  let NumMicroOps = 2;
1034}
1035def : InstRW<[Zn3WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1036
1037defm : Zn3WriteResXMMPair<WriteVecALU, [Zn3FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.
1038
1039def Zn3WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1040  let Latency = 3;
1041  let ResourceCycles = [1, 1];
1042  let NumMicroOps = 1;
1043}
1044def : InstRW<[Zn3WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
1045
1046def Zn3WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn3FPVShuf01, Zn3FPLd01]> {
1047  let Latency = 3;
1048  let ResourceCycles = [1, 1];
1049  let NumMicroOps = 2;
1050}
1051def : InstRW<[Zn3WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
1052
1053defm : Zn3WriteResXMMPair<WriteVecALUX, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
1054
1055def Zn3WriteVecALUXSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1056  let Latency = 1;
1057  let ResourceCycles = [1];
1058  let NumMicroOps = 1;
1059}
1060def : InstRW<[Zn3WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
1061                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
1062                                            PAVGBrr, PAVGWrr,
1063                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
1064                                            VPABSBrr, VPABSDrr, VPABSWrr,
1065                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
1066                                            VPAVGBrr, VPAVGWrr,
1067                                            VPCMPEQQrr,
1068                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
1069                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1070
1071def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> {
1072  let Latency = 1;
1073  let ResourceCycles = [1];
1074  let NumMicroOps = 1;
1075}
1076def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr,
1077                                           MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr,
1078                                           MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr,
1079                                           MMX_PAVGBrr, MMX_PAVGWrr,
1080                                           MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>;
1081
1082defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
1083
1084def Zn3WriteVecALUYSlow : SchedWriteRes<[Zn3FPVAdd01]> {
1085  let Latency = 1;
1086  let ResourceCycles = [1];
1087  let NumMicroOps = 1;
1088}
1089def : InstRW<[Zn3WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
1090                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
1091                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
1092                                            VPAVGBYrr, VPAVGWYrr,
1093                                            VPCMPEQQYrr,
1094                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1095
1096defm : X86WriteResPairUnsupported<WriteVecALUZ>; // Vector integer ALU op, no logicals (ZMM).
1097defm : Zn3WriteResXMMPair<WriteVecLogic, [Zn3FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
1098defm : Zn3WriteResXMMPair<WriteVecLogicX, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
1099defm : Zn3WriteResYMMPair<WriteVecLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
1100defm : X86WriteResPairUnsupported<WriteVecLogicZ>; // Vector integer and/or/xor logicals (ZMM).
1101defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>;  // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1102defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis  // Vector integer TEST instructions (YMM).
1103defm : X86WriteResPairUnsupported<WriteVecTestZ>;  // Vector integer TEST instructions (ZMM).
1104defm : Zn3WriteResXMMPair<WriteVecShift, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
1105defm : Zn3WriteResXMMPair<WriteVecShiftX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (XMM).
1106defm : Zn3WriteResYMMPair<WriteVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
1107defm : X86WriteResPairUnsupported<WriteVecShiftZ>; // Vector integer shifts (ZMM).
1108defm : Zn3WriteResXMMPair<WriteVecShiftImm, [Zn3FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
1109defm : Zn3WriteResXMMPair<WriteVecShiftImmX, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
1110defm : Zn3WriteResYMMPair<WriteVecShiftImmY, [Zn3FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
1111defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; // Vector integer immediate shifts (ZMM).
1112defm : Zn3WriteResXMMPair<WriteVecIMul, [Zn3FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
1113defm : Zn3WriteResXMMPair<WriteVecIMulX, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
1114defm : Zn3WriteResYMMPair<WriteVecIMulY, [Zn3FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
1115defm : X86WriteResPairUnsupported<WriteVecIMulZ>; // Vector integer multiply (ZMM).
1116defm : Zn3WriteResXMMPair<WritePMULLD, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD.
1117defm : Zn3WriteResYMMPair<WritePMULLDY, [Zn3FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
1118defm : X86WriteResPairUnsupported<WritePMULLDZ>; // Vector PMULLD (ZMM).
1119defm : Zn3WriteResXMMPair<WriteShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector shuffles.
1120defm : Zn3WriteResXMMPair<WriteShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
1121defm : Zn3WriteResYMMPair<WriteShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
1122defm : X86WriteResPairUnsupported<WriteShuffleZ>; // Vector shuffles (ZMM).
1123defm : Zn3WriteResXMMPair<WriteVarShuffle, [Zn3FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
1124defm : Zn3WriteResXMMPair<WriteVarShuffleX, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
1125defm : Zn3WriteResYMMPair<WriteVarShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
1126defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; // Vector variable shuffles (ZMM).
1127defm : Zn3WriteResXMMPair<WriteBlend, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends.
1128defm : Zn3WriteResYMMPair<WriteBlendY, [Zn3FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
1129defm : X86WriteResPairUnsupported<WriteBlendZ>; // Vector blends (ZMM).
1130defm : Zn3WriteResXMMPair<WriteVarBlend, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends.
1131defm : Zn3WriteResYMMPair<WriteVarBlendY, [Zn3FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
1132defm : X86WriteResPairUnsupported<WriteVarBlendZ>; // Vector variable blends (ZMM).
1133defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
1134defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1135defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1136defm : X86WriteResPairUnsupported<WritePSADBWZ>; // Vector PSADBW (ZMM).
1137defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1138defm : Zn3WriteResYMMPair<WriteMPSADY, [Zn3FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
1139defm : X86WriteResPairUnsupported<WriteMPSADZ>; // Vector MPSAD (ZMM).
1140defm : Zn3WriteResXMMPair<WritePHMINPOS, [Zn3FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.
1141
1142// Vector insert/extract operations.
1143defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1144defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1145defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1146
1147// MOVMSK operations.
1148defm : Zn3WriteResXMM<WriteFMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1149defm : Zn3WriteResXMM<WriteVecMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1150defm : Zn3WriteResYMM<WriteVecMOVMSKY, [Zn3FPVMisc2], 1, [1], 1>;
1151defm : Zn3WriteResXMM<WriteMMXMOVMSK, [Zn3FPVMisc2], 1, [1], 1>;
1152
1153// Conversion between integer and float.
1154defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>;  // Double -> Integer.
1155defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
1156defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
1157defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
1158
1159def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1160  let Latency = 1;
1161  let ResourceCycles = [2];
1162  let NumMicroOps = 2;
1163}
1164def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>;
1165
1166defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>;  // Float -> Integer.
1167
1168defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1169defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
1170defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
1171
1172defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
1173defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1174defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1175defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
1176
1177def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1178  let Latency = 2;
1179  let ResourceCycles = [6];
1180  let NumMicroOps = 2;
1181}
1182def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>;
1183
1184defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
1185defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1186defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1187defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
1188
1189def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> {
1190  let Latency = 3;
1191  let ResourceCycles = [1];
1192  let NumMicroOps = 2;
1193}
1194def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>;
1195
1196defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion.
1197defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1198defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1199defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).
1200
1201defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion.
1202defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1203defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1204defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).
1205
1206defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1207defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1208defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
1209
1210defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1211defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1212defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
1213defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1214defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1215defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
1216
1217// CRC32 instruction.
1218defm : Zn3WriteResIntPair<WriteCRC32, [Zn3ALU1], 3, [1], 1>;
1219
1220def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1221  let Latency = 2;
1222  let ResourceCycles = [2];
1223  let NumMicroOps = 2;
1224}
1225def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
1226
1227def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1228  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
1229  let ResourceCycles = [1, 1, 2];
1230  let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
1231}
1232def : InstRW<[Zn3WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
1233
1234def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
1235  let Latency = 1;
1236  let ResourceCycles = [2];
1237  let NumMicroOps = 1;
1238}
1239def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
1240
1241def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1242  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
1243  let ResourceCycles = [1, 1, 2];
1244  let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
1245}
1246def : InstRW<[Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1247
1248def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
1249  let Latency = 2;
1250  let ResourceCycles = [3];
1251  let NumMicroOps = 2;
1252}
1253def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
1254
1255def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1256  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
1257  let ResourceCycles = [1, 1, 3];
1258  let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
1259}
1260def : InstRW<[Zn3Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1261
1262def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
1263  let Latency = 3;
1264  let ResourceCycles = [8];
1265  let NumMicroOps = 4;
1266}
1267def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
1268
1269def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
1270  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
1271  let ResourceCycles = [1, 1, 8];
1272  let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
1273}
1274def : InstRW<[Zn3WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1275
1276def Zn3WriteSHA1RNDS4rri : SchedWriteRes<[Zn3FPU0123]> {
1277  let Latency = 6;
1278  let ResourceCycles = [8];
1279  let NumMicroOps = 1;
1280}
1281def : InstRW<[Zn3WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
1282
1283def Zn3WriteSHA256RNDS2rr : SchedWriteRes<[Zn3FPU0123]> {
1284  let Latency = 4;
1285  let ResourceCycles = [8];
1286  let NumMicroOps = 1;
1287}
1288def : InstRW<[Zn3WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1289
1290// Strings instructions.
1291// Packed Compare Implicit Length Strings, Return Mask
1292defm : Zn3WriteResXMMPair<WritePCmpIStrM, [Zn3FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1293// Packed Compare Explicit Length Strings, Return Mask
1294defm : Zn3WriteResXMMPair<WritePCmpEStrM, [Zn3FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1295// Packed Compare Implicit Length Strings, Return Index
1296defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
1297// Packed Compare Explicit Length Strings, Return Index
1298defm : Zn3WriteResXMMPair<WritePCmpEStrI, [Zn3FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1299
1300// AES instructions.
1301defm : Zn3WriteResXMMPair<WriteAESDecEnc, [Zn3FPAES01], 4, [1], 1>; // Decryption, encryption.
1302defm : Zn3WriteResXMMPair<WriteAESIMC, [Zn3FPAES01], 4, [1], 1>; // InvMixColumn.
1303defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Generation.
1304
1305// Carry-less multiplication instructions.
1306defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;
1307
1308// EMMS/FEMMS
1309defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1310
1311// Load/store MXCSR
1312defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1313defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1314
1315// Catch-all for expensive system instructions.
1316defm : Zn3WriteResInt<WriteSystem, [Zn3ALU0123], 100, [100], 100>;
1317
1318def Zn3WriteVZEROUPPER : SchedWriteRes<[Zn3FPU0123]> {
1319  let Latency = 0; // FIXME: not from llvm-exegesis
1320  let ResourceCycles = [1];
1321  let NumMicroOps = 1;
1322}
1323def : InstRW<[Zn3WriteVZEROUPPER], (instrs VZEROUPPER)>;
1324
1325def Zn3WriteVZEROALL : SchedWriteRes<[Zn3FPU0123]> {
1326  let Latency = 10; // FIXME: not from llvm-exegesis
1327  let ResourceCycles = [24];
1328  let NumMicroOps = 18;
1329}
1330def : InstRW<[Zn3WriteVZEROALL], (instrs VZEROALL)>;
1331
1332// AVX2.
1333defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1334defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1335defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
1336
1337def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
1338  let Latency = 3;
1339  let ResourceCycles = [1];
1340  let NumMicroOps = 1;
1341}
1342def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
1343
1344def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1345  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
1346  let ResourceCycles = [1, 1, 1];
1347  let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
1348}
1349def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
1350
1351def Zn3WriteVPERMPSYrr : SchedWriteRes<[Zn3FPVShuf]> {
1352  let Latency = 7;
1353  let ResourceCycles = [1];
1354  let NumMicroOps = 2;
1355}
1356def : InstRW<[Zn3WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
1357
1358def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1359  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMPSYrr.Latency);
1360  let ResourceCycles = [1, 1, 2];
1361  let NumMicroOps = !add(Zn3WriteVPERMPSYrr.NumMicroOps, 1);
1362}
1363def : InstRW<[Zn3WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1364
1365def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
1366  let Latency = 6;
1367  let ResourceCycles = [1];
1368  let NumMicroOps = 2;
1369}
1370def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
1371
1372def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1373  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
1374  let ResourceCycles = [1, 1, 2];
1375  let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
1376}
1377def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1378
1379def Zn3WriteVPERMDYrr : SchedWriteRes<[Zn3FPVShuf]> {
1380  let Latency = 5;
1381  let ResourceCycles = [1];
1382  let NumMicroOps = 2;
1383}
1384def : InstRW<[Zn3WriteVPERMDYrr], (instrs VPERMDYrr)>;
1385
1386def Zn3WriteVPERMYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
1387  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMDYrr.Latency);
1388  let ResourceCycles = [1, 1, 2];
1389  let NumMicroOps = !add(Zn3WriteVPERMDYrr.NumMicroOps, 0);
1390}
1391def : InstRW<[Zn3WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
1392
1393defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1394defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
1395defm : Zn3WriteResXMMPair<WriteVarVecShift, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts.
1396defm : Zn3WriteResYMMPair<WriteVarVecShiftY, [Zn3FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1397defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Variable vector shifts (ZMM).
1398
1399// Old microcoded instructions that nobody use.
1400defm : Zn3WriteResInt<WriteMicrocoded, [Zn3ALU0123], 100, [100], 100>;
1401
1402// Fence instructions.
1403defm : Zn3WriteResInt<WriteFence, [Zn3ALU0123], 1, [100], 1>;
1404
1405def Zn3WriteLFENCE : SchedWriteRes<[Zn3LSU]> {
1406  let Latency = 1;
1407  let ResourceCycles = [30];
1408  let NumMicroOps = 1;
1409}
1410def : InstRW<[Zn3WriteLFENCE], (instrs LFENCE)>;
1411
1412def Zn3WriteSFENCE : SchedWriteRes<[Zn3LSU]> {
1413  let Latency = 1;
1414  let ResourceCycles = [1];
1415  let NumMicroOps = 1;
1416}
1417def : InstRW<[Zn3WriteSFENCE], (instrs SFENCE)>;
1418
1419// Nop, not very useful expect it provides a model for nops!
1420defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1421
1422
1423///////////////////////////////////////////////////////////////////////////////
1424// Zero Cycle Move
1425///////////////////////////////////////////////////////////////////////////////
1426
1427def Zn3WriteZeroLatency : SchedWriteRes<[]> {
1428  let Latency = 0;
1429  let ResourceCycles = [];
1430  let NumMicroOps = 1;
1431}
1432def : InstRW<[Zn3WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
1433                                               MOV64rr, MOV64rr_REV,
1434                                               MOVSX32rr32)>;
1435
1436def Zn3WriteSwapRenameable : SchedWriteRes<[]> {
1437  let Latency = 0;
1438  let ResourceCycles = [];
1439  let NumMicroOps = 2;
1440}
1441def : InstRW<[Zn3WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
1442                                               XCHG64rr, XCHG64ar)>;
1443
1444defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>;        // Compare+Exchange - TODO RMW support.
1445
1446defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
1447defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
1448defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
1449defm : X86WriteResUnsupported<WriteFMoveZ>;
1450
1451defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
1452defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
1453defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
1454defm : X86WriteResUnsupported<WriteVecMoveZ>;
1455
1456def : IsOptimizableRegisterMove<[
1457  InstructionEquivalenceClass<[
1458    // GPR variants.
1459    MOV32rr, MOV32rr_REV,
1460    MOV64rr, MOV64rr_REV,
1461    MOVSX32rr32,
1462    XCHG32rr, XCHG32ar,
1463    XCHG64rr, XCHG64ar,
1464
1465    // MMX variants.
1466    // MMX moves are *NOT* eliminated.
1467
1468    // SSE variants.
1469    MOVAPSrr, MOVAPSrr_REV,
1470    MOVUPSrr, MOVUPSrr_REV,
1471    MOVAPDrr, MOVAPDrr_REV,
1472    MOVUPDrr, MOVUPDrr_REV,
1473    MOVDQArr, MOVDQArr_REV,
1474    MOVDQUrr, MOVDQUrr_REV,
1475
1476    // AVX variants.
1477    VMOVAPSrr, VMOVAPSrr_REV,
1478    VMOVUPSrr, VMOVUPSrr_REV,
1479    VMOVAPDrr, VMOVAPDrr_REV,
1480    VMOVUPDrr, VMOVUPDrr_REV,
1481    VMOVDQArr, VMOVDQArr_REV,
1482    VMOVDQUrr, VMOVDQUrr_REV,
1483
1484    // AVX YMM variants.
1485    VMOVAPSYrr, VMOVAPSYrr_REV,
1486    VMOVUPSYrr, VMOVUPSYrr_REV,
1487    VMOVAPDYrr, VMOVAPDYrr_REV,
1488    VMOVUPDYrr, VMOVUPDYrr_REV,
1489    VMOVDQAYrr, VMOVDQAYrr_REV,
1490    VMOVDQUYrr, VMOVDQUYrr_REV,
1491  ], TruePred >
1492]>;
1493
1494///////////////////////////////////////////////////////////////////////////////
1495// Dependency breaking instructions.
1496///////////////////////////////////////////////////////////////////////////////
1497
1498def Zn3WriteZeroIdiom : SchedWriteVariant<[
1499    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1500    SchedVar<NoSchedPred,                          [WriteALU]>
1501]>;
1502def : InstRW<[Zn3WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
1503                                          XOR64rr, XOR64rr_REV,
1504                                          SUB32rr, SUB32rr_REV,
1505                                          SUB64rr, SUB64rr_REV)>;
1506
1507def Zn3WriteZeroIdiomEFLAGS : SchedWriteVariant<[
1508    SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn3WriteZeroLatency]>,
1509    SchedVar<NoSchedPred,                                 [WriteALU]>
1510]>;
1511def : InstRW<[Zn3WriteZeroIdiomEFLAGS], (instrs CMP8rr,  CMP8rr_REV,
1512                                                CMP16rr, CMP16rr_REV,
1513                                                CMP32rr, CMP32rr_REV,
1514                                                CMP64rr, CMP64rr_REV)>;
1515
1516def Zn3WriteFZeroIdiom : SchedWriteVariant<[
1517    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1518    SchedVar<NoSchedPred,                          [WriteFLogic]>
1519]>;
1520// NOTE: XORPSrr, XORPDrr are not zero-cycle!
1521def : InstRW<[Zn3WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
1522                                           VANDNPSrr, VANDNPDrr)>;
1523
1524def Zn3WriteFZeroIdiomY : SchedWriteVariant<[
1525    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1526    SchedVar<NoSchedPred,                          [WriteFLogicY]>
1527]>;
1528def : InstRW<[Zn3WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
1529                                            VANDNPSYrr, VANDNPDYrr)>;
1530
1531def Zn3WriteVZeroIdiomLogicX : SchedWriteVariant<[
1532    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1533    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
1534]>;
1535// NOTE: PXORrr,PANDNrr are not zero-cycle!
1536def : InstRW<[Zn3WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
1537
1538def Zn3WriteVZeroIdiomLogicY : SchedWriteVariant<[
1539    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1540    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
1541]>;
1542def : InstRW<[Zn3WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
1543
1544def Zn3WriteVZeroIdiomALUX : SchedWriteVariant<[
1545    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1546    SchedVar<NoSchedPred,                          [WriteVecALUX]>
1547]>;
1548// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1549//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1550def : InstRW<[Zn3WriteVZeroIdiomALUX],
1551             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1552                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
1553
1554def Zn3WriteVZeroIdiomALUY : SchedWriteVariant<[
1555    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn3WriteZeroLatency]>,
1556    SchedVar<NoSchedPred,                          [WriteVecALUY]>
1557]>;
1558def : InstRW<[Zn3WriteVZeroIdiomALUY],
1559             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1560                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
1561
1562def : IsZeroIdiomFunction<[
1563  // GPR Zero-idioms.
1564  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
1565                     XOR64rr, XOR64rr_REV,
1566                     SUB32rr, SUB32rr_REV,
1567                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
1568
1569  // SSE XMM Zero-idioms.
1570  DepBreakingClass<[
1571    // fp variants.
1572    XORPSrr, XORPDrr,
1573    ANDNPSrr, ANDNPDrr,
1574
1575    // int variants.
1576    PXORrr,
1577    PANDNrr,
1578    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1579    PSUBSBrr, PSUBSWrr,
1580    PSUBUSBrr, PSUBUSWrr,
1581    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
1582  ], ZeroIdiomPredicate>,
1583
1584  // AVX XMM Zero-idioms.
1585  DepBreakingClass<[
1586    // fp variants.
1587    VXORPSrr, VXORPDrr,
1588    VANDNPSrr, VANDNPDrr,
1589
1590    // int variants.
1591    VPXORrr,
1592    VPANDNrr,
1593    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1594    VPSUBSBrr, VPSUBSWrr,
1595    VPSUBUSBrr, VPSUBUSWrr,
1596    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1597  ], ZeroIdiomPredicate>,
1598
1599  // AVX YMM Zero-idioms.
1600  DepBreakingClass<[
1601    // fp variants.
1602    VXORPSYrr, VXORPDYrr,
1603    VANDNPSYrr, VANDNPDYrr,
1604
1605    // int variants.
1606    VPXORYrr,
1607    VPANDNYrr,
1608    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
1609    VPSUBSBYrr, VPSUBSWYrr,
1610    VPSUBUSBYrr, VPSUBUSWYrr,
1611    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
1612  ], ZeroIdiomPredicate>,
1613]>;
1614
1615def : IsDepBreakingFunction<[
1616  // GPR
1617  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
1618                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
1619  DepBreakingClass<[ CMP8rr,  CMP8rr_REV,
1620                     CMP16rr, CMP16rr_REV,
1621                     CMP32rr, CMP32rr_REV,
1622                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
1623
1624  // MMX
1625  DepBreakingClass<[
1626    MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
1627  ], ZeroIdiomPredicate>,
1628
1629  // SSE
1630  DepBreakingClass<[
1631    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1632  ], ZeroIdiomPredicate>,
1633
1634  // AVX XMM
1635  DepBreakingClass<[
1636    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1637  ], ZeroIdiomPredicate>,
1638
1639  // AVX YMM
1640  DepBreakingClass<[
1641    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
1642  ], ZeroIdiomPredicate>,
1643]>;
1644
1645} // SchedModel
1646