xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td (revision 04eeddc0aa8e0a417a16eaf9d7d095207f4a8623)
1//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD btver2 (Jaguar) to support
10// instruction scheduling and other instruction cost heuristics. Based off AMD Software
11// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
12//
13//===----------------------------------------------------------------------===//
14
15def BtVer2Model : SchedMachineModel {
16  // All x86 instructions are modeled as a single micro-op, and btver2 can
17  // decode 2 instructions per cycle.
18  let IssueWidth = 2;
19  let MicroOpBufferSize = 64; // Retire Control Unit capacity (same 64 as JRCU)
20  let LoadLatency = 5; // FPU load latency (worst case; integer loads take 3 cycles)
21  let HighLatency = 25;
22  let MispredictPenalty = 14; // Minimum branch misdirection penalty
23  let PostRAScheduler = 1;
24
25  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
26  // the scheduler to assign a default model to unrecognized opcodes.
27  let CompleteModel = 0;
28}
29
30let SchedModel = BtVer2Model in {
31
32// Jaguar can issue up to 6 micro-ops in one cycle; the six pipes are defined below.
33def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
34def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
35def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
36def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
37def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
38def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
39
40// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
41// speculative versions of the 64-bit integer registers.
42// Reference: www.realworldtech.com/jaguar/4/
43//
44// The processor always keeps the different parts of an integer register
45// together. An instruction that writes to a part of a register will therefore
46// have a false dependence on any previous write to the same register or any
47// part of it.
48// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
49// access" - Agner Fog's "microarchitecture.pdf".
50def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0], // 64 entries shared by GR64 and CCR
51                               0,  // Max moves that can be eliminated per cycle.
52                               1>; // Restrict move elimination to zero regs.
53
54// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
55// registers. Operations on 256-bit data types are cracked into two COPs.
56// Reference: www.realworldtech.com/jaguar/4/
57
58// The PRF in the floating point unit can eliminate a move from an MMX or SSE
59// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
60// dependency breaking instruction, or via VZEROALL).
61// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
62// instructions" - Agner Fog's "microarchitecture.pdf"
63def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0], // 72 entries; NOTE(review): the 2 for VR256 presumably reflects the two-COP cracking - confirm
64                          0,  // Max moves that can be eliminated per cycle.
65                          1>; // Restrict move elimination to zero regs.
66
67// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
68// retire up to two macro-ops per cycle.
69// Reference: "Software Optimization Guide for AMD Family 16h Processors"
70def JRCU : RetireControlUnit<64, 2>; // 64 in-flight macro-ops, 2 retired per cycle
71
72// Integer pipe scheduler: groups the two integer ALU pipes (JALU0, JALU1).
73def JALU01 : ProcResGroup<[JALU0, JALU1]> {
74  let BufferSize=20;
75}
76
77// AGU pipe scheduler: groups the load AGU and the store AGU (JLAGU, JSAGU).
78def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
79  let BufferSize=12;
80}
81
82// FPU pipe scheduler: groups the two vector/FPU pipes (JFPU0, JFPU1).
83def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
84  let BufferSize=18;
85}
86
87// Functional units (modeled in addition to the six issue pipes).
88def JDiv    : ProcResource<1>; // integer division
89def JMul    : ProcResource<1>; // integer multiplication
90def JVALU0  : ProcResource<1>; // vector integer ALU 0
91def JVALU1  : ProcResource<1>; // vector integer ALU 1
92def JVIMUL  : ProcResource<1>; // vector integer multiplication
93def JSTC    : ProcResource<1>; // vector store/convert
94def JFPM    : ProcResource<1>; // FP multiplication
95def JFPA    : ProcResource<1>; // FP addition
96
97// Functional unit groups
98def JFPX  : ProcResGroup<[JFPA, JFPM]>; // either FP unit (addition or multiplication)
99def JVALU : ProcResGroup<[JVALU0, JVALU1]>; // either vector integer ALU
100
101// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
102// cycles after the memory operand.
103def : ReadAdvance<ReadAfterLd, 3>;
104
105// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
106// cycles after the memory operand.
107def : ReadAdvance<ReadAfterVecLd, 5>;
108def : ReadAdvance<ReadAfterVecXLd, 5>;
109def : ReadAdvance<ReadAfterVecYLd, 5>;
110
111/// "Additional 6 cycle transfer operation which moves a floating point
112/// operation input value from the integer unit to the floating point unit."
113/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
114def : ReadAdvance<ReadInt2Fpu, -6>; // negative advance models the extra 6cy int->FPU transfer
115
116// Many SchedWrites are defined in pairs with and without a folded load.
117// Instructions with folded loads are usually micro-fused, so they only appear
118// as two micro-ops when dispatched by the schedulers.
119// This multiclass defines the resource usage of an integer write for both
120// variants: the register form and the folded-load form (SchedRW.Folded).
121multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW, // write being described
122                            list<ProcResourceKind> ExePorts, // resources used by the register form
123                            int Lat, list<int> Res = [], int UOps = 1, // latency, cycles per ExePorts entry, micro-ops
124                            int LoadUOps = 0> { // extra micro-ops added to the folded form
125  // Register variant: uses ExePorts for Res cycles (a single cycle when Res is empty).
126  def : WriteRes<SchedRW, ExePorts> {
127    let Latency = Lat;
128    let ResourceCycles = Res;
129    let NumMicroOps = UOps;
130  }
131
132  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
133  // latency (the integer load latency).
134  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
135    let Latency = !add(Lat, 3);
136    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); // prepend JLAGU's 1 cycle unless Res is empty
137    let NumMicroOps = !add(UOps, LoadUOps);
138  }
139}
140
141multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW, // FP/vector write; SchedRW.Folded is the load-folded form
142                            list<ProcResourceKind> ExePorts, // resources used by the register form
143                            int Lat, list<int> Res = [], int UOps = 1, // latency, cycles per ExePorts entry, micro-ops
144                            int LoadUOps = 0> { // extra micro-ops added to the folded form
145  // Register variant: uses ExePorts for Res cycles (a single cycle when Res is empty).
146  def : WriteRes<SchedRW, ExePorts> {
147    let Latency = Lat;
148    let ResourceCycles = Res;
149    let NumMicroOps = UOps;
150  }
151
152  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
153  // latency (the vector load latency).
154  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
155    let Latency = !add(Lat, 5);
156    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); // prepend JLAGU's 1 cycle unless Res is empty
157    let NumMicroOps = !add(UOps, LoadUOps);
158  }
159}
160
161multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW, // YMM write; SchedRW.Folded is the load-folded form
162                            list<ProcResourceKind> ExePorts, // resources used by the register form
163                            int Lat, list<int> Res = [2], int UOps = 2, // defaults: 2 cycles and 2 micro-ops
164                            int LoadUOps = 0> { // extra micro-ops added to the folded form
165  // Register variant uses ExePorts for Res cycles (2 cycles by default).
166  def : WriteRes<SchedRW, ExePorts> {
167    let Latency = Lat;
168    let ResourceCycles = Res;
169    let NumMicroOps = UOps;
170  }
171
172  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
173  // latency.
174  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
175    let Latency = !add(Lat, 5);
176    let ResourceCycles = !listconcat([2], Res); // always prepended: no empty-Res special case here
177    let NumMicroOps = !add(UOps, LoadUOps);
178  }
179}
180
181// Instructions that have local forwarding disabled have an extra +1cy latency.
182
183// A folded store needs a cycle on the SAGU for the store data; most RMW
184// instructions don't need an extra uop.  ALU RMW operations don't seem to
185// benefit from store-to-load forwarding (STLF), and their observed latency is 6cy. That is why
186// this write adds two extra cycles (instead of just 1cy for the store).
187defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>; // +2cy, 1 cycle on JSAGU, no extra uop
188
189////////////////////////////////////////////////////////////////////////////////
190// Arithmetic. JWriteResIntPair args: write, ports, latency[, cycles, uops[, load uops]].
191////////////////////////////////////////////////////////////////////////////////
192
193defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
194defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>; // 1cy latency, 2 cycles on JALU01
195
196defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
197defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
198defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
199defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
200defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;
201
202defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>; // multiplies issue on JALU1 and occupy JMul
203defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
204defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
205defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
206defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
207defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
208defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
209defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
210defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
211defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
212defm : X86WriteResUnsupported<WriteIMulH>;
213defm : X86WriteResUnsupported<WriteIMulHLd>;
214defm : X86WriteResPairUnsupported<WriteMULX32>;
215defm : X86WriteResPairUnsupported<WriteMULX64>;
216
217defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>; // every divide holds JDiv for its full latency
218defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
219defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
220defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
221defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
222defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
223defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
224defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
225
226defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
227
228defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
229defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
230def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
231def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>; // Setcc with store: also uses the store AGU.
232def  : WriteRes<WriteLAHFSAHF, [JALU01]>; // LAHF/SAHF.
233
234defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
235defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>; // the *Ld forms add JLAGU and 3cy
236defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
237defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
238defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
239defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
240
241// This is for simple LEAs with one or two input operands (either integer ALU).
242def : WriteRes<WriteLEA, [JALU01]>;
243
244// Bit counts.
245defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>; // 4cy latency, 8 cycles on JALU01, 7 uops
246defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>; // 5cy latency, 8 cycles on JALU01, 8 uops
247defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
248defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
249defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
250
251// BMI1 BEXTR/BLS, BMI2 BZHI
252defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
253defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
254defm : X86WriteResPairUnsupported<WriteBZHI>; // BZHI is marked unsupported in this model
255
256////////////////////////////////////////////////////////////////////////////////
257// Integer shifts and rotates.
258////////////////////////////////////////////////////////////////////////////////
259
260defm : JWriteResIntPair<WriteShift,    [JALU01], 1>; // all four: 1cy on either integer ALU
261defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
262defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
263defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
264
265// SHLD/SHRD. The mri/mrcl entries also occupy JLAGU.
266defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
267defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
268defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
269defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
270
271////////////////////////////////////////////////////////////////////////////////
272// Loads, stores, and moves, not folded with other operations.
273////////////////////////////////////////////////////////////////////////////////
274
275def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; } // integer load latency
276def : WriteRes<WriteStore,   [JSAGU]>;
277def : WriteRes<WriteStoreNT, [JSAGU]>;
278def : WriteRes<WriteMove,    [JALU01]>;
279defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
280
281// Load/store MXCSR: modeled like a plain load (JLAGU, 3cy) and a plain store (JSAGU).
282def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
283def : WriteRes<WriteSTMXCSR, [JSAGU]>;
284
285// Treat misc copies as a move.
286def : InstRW<[WriteMove], (instrs COPY)>;
287
288////////////////////////////////////////////////////////////////////////////////
289// Idioms that clear a register, like xorps %xmm0, %xmm0.
290// These can often bypass execution ports completely.
291////////////////////////////////////////////////////////////////////////////////
292
293def : WriteRes<WriteZero,  []>; // empty resource list: no execution port is consumed
294
295////////////////////////////////////////////////////////////////////////////////
296// Branches don't produce values, so they have no latency, but they still
297// consume resources. Indirect branches can fold loads.
298////////////////////////////////////////////////////////////////////////////////
299
300defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
301
302////////////////////////////////////////////////////////////////////////////////
303// Special case scheduling classes.
304////////////////////////////////////////////////////////////////////////////////
305
306def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; } // NOTE(review): 100cy looks like a conservative placeholder - confirm
307def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; } // NOTE(review): same placeholder value as WriteSystem - confirm
308def : WriteRes<WriteFence,  [JSAGU]>;
309
310// Nops don't have dependencies, so there's no actual latency, but we set this
311// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
312def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
313
314def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> { // chosen for IsRegRegCompareAndSwap_8
315  let Latency = 3;
316  let ResourceCycles = [3];
317  let NumMicroOps = 3;
318}
319
320def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsAtomicCompareAndSwap_8
321  let Latency = 16;
322  let ResourceCycles = [3,16,16]; // both AGUs are held for the whole 16cy latency
323  let NumMicroOps = 5;
324}
325
326def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsAtomicCompareAndSwap
327  let Latency = 17;
328  let ResourceCycles = [3,17,17]; // both AGUs are held for the whole 17cy latency
329  let NumMicroOps = 6;
330}
331
332def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsRegMemCompareAndSwap_8
333  let Latency = 11;
334  let ResourceCycles = [3,1,1];
335  let NumMicroOps = 5;
336}
337
338def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsCompareAndSwap8B
339  let Latency = 11;
340  let ResourceCycles = [3,1,1];
341  let NumMicroOps = 18;
342}
343
344def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsCompareAndSwap16B
345  let Latency = 32;
346  let ResourceCycles = [6,1,1];
347  let NumMicroOps = 28;
348}
349
350def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsAtomicCompareAndSwap8B
351  let Latency = 19;
352  let ResourceCycles = [3,19,19]; // both AGUs are held for the whole 19cy latency
353  let NumMicroOps = 18;
354}
355
356def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // chosen for IsAtomicCompareAndSwap16B
357  let Latency = 38;
358  let ResourceCycles = [6,38,38]; // both AGUs are held for the whole 38cy latency
359  let NumMicroOps = 28;
360}
361
362def JWriteCMPXCHGVariant :  SchedWriteVariant<[ // picks the CMPXCHG write by instruction form; LOCKed forms are listed first
363  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
364  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
365  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
366  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
367  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
368  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
369  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
370  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
371  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
372  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]> // unpredicated entry: generic WriteCMPXCHG
373]>;
374
375// The first five reads (the five ReadDefault entries) are contributed by the
376// memory operand. We ignore those reads and set a read-advance for the other
377// input operands, including the implicit read of RAX.
378def : InstRW<[JWriteCMPXCHGVariant,
379              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
380              ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
381                                                 LCMPXCHG32, LCMPXCHG64,
382                                                 CMPXCHG8rm, CMPXCHG16rm,
383                                                 CMPXCHG32rm, CMPXCHG64rm)>;
384
385def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr, // register forms: no read overrides
386                                             CMPXCHG32rr, CMPXCHG64rr)>;
387
388def : InstRW<[JWriteCMPXCHGVariant,
389              // Ignore the five reads contributed by the memory operand.
390              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
391              // Add a read-advance to every implicit register read (four of them).
392              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
393                                                                           CMPXCHG8B, CMPXCHG16B)>;
394
395def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { // LOCK-prefixed ALU read-modify-write
396  let Latency = 19;
397  let ResourceCycles = [1,19,19]; // both AGUs are held for the whole 19cy latency
398  let NumMicroOps = 1;
399}
400
401def JWriteLOCK_ALURMWVariant :  SchedWriteVariant<[
402  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>, // LOCK prefix present
403  SchedVar<NoSchedPred,                       [WriteALURMW]> // unpredicated entry: generic WriteALURMW
404]>;
405def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
406                                                 DEC8m, DEC16m, DEC32m, DEC64m,
407                                                 NOT8m, NOT16m, NOT32m, NOT64m,
408                                                 NEG8m, NEG16m, NEG32m, NEG64m)>;
409
410def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> { // shared by XCHG8rr and the XADD*rr forms bound below
411  let Latency = 2;
412  let ResourceCycles = [3]; // 3 cycles on JALU01
413  let NumMicroOps = 3;
414}
415def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
416                                                      XADD32rr, XADD64rr)>;
417
418// This write defines the latency of the in/out register operand of a non-atomic
419// XADDrm. This is the first of a pair of writes that model non-atomic
420// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
421//
422// We need two writes because the instruction latency differs from the output
423// register operand latency. In particular, the first write describes the first
424// (and only) output register operand of the instruction.  However, the
425// instruction latency is set to the MAX of all the write latencies. That's why
426// a second write is needed in this case (see example below).
427//
428// Example:
429//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
430//                            ## ECX write Latency: 3cy
431//
432// Register ECX becomes available in 3 cycles. That is because the value of ECX
433// is exchanged with the value loaded from memory at (%rsp), and the load-to-use
434// latency is assumed to be 3cy.
435def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
436  let Latency = 3;  // load-to-use latency
437  let ResourceCycles = [3];
438  let NumMicroOps = 3;
439}
440
441// This write defines the latency of the in/out register operand of an atomic
442// XADDrm. This is the first of a sequence of two writes used to model atomic
443// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part,
444// which supplies the 16cy instruction latency.
445//
446// Example:
447//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
448//                               ## ECX write Latency: 11cy
449//
450// The value of ECX becomes available only after 11cy from the start of
451// execution. This write is used to specifically set that operand latency.
452def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
453  let Latency = 11; // ECX write latency from the example above
454  let ResourceCycles = [3];
455  let NumMicroOps = 3;
456}
457
458// This write defines the latency of the in/out register operand of an atomic
459// XCHGrm. This write is the first of a sequence of two writes that describe
460// atomic XCHG operations. We need two writes because the instruction latency
461// differs from the output register write latency.  We want to make sure that
462// the output register operand becomes visible after 11cy. However, we want to
463// set the instruction latency to 16cy (done by JWriteXCHGrm_LdSt_Part).
464def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
465  let Latency = 11; // output register operand becomes visible after 11cy
466  let ResourceCycles = [2];
467  let NumMicroOps = 2;
468}
469
470def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { // second write of non-atomic XADDrm
471  let Latency = 11; // becomes the instruction latency (MAX over the writes)
472  let ResourceCycles = [1, 1];
473  let NumMicroOps = 1;
474}
475
476def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { // second write of XCHGrm and of LOCK XADDrm
477  let Latency = 16; // becomes the instruction latency (MAX over the writes)
478  let ResourceCycles = [16, 16]; // both AGUs are held for the whole 16cy
479  let NumMicroOps = 1;
480}
481
482def JWriteXADDrm_Part1 : SchedWriteVariant<[ // register-operand write of XADDrm
483  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>, // LOCK prefix: 11cy
484  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]> // no LOCK prefix: 3cy
485]>;
486
487def JWriteXADDrm_Part2 : SchedWriteVariant<[ // load/store write of XADDrm
488  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>, // LOCK prefix: 16cy
489  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]> // no LOCK prefix: 11cy
490]>;
491
492def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
493                 (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
494                         LXADD8, LXADD16, LXADD32, LXADD64)>;
495
496def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd], // XCHGrm always uses the 11cy/16cy pair
497                 (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
498
499
500////////////////////////////////////////////////////////////////////////////////
501// Floating point. This covers both scalar and vector operations.
502////////////////////////////////////////////////////////////////////////////////
503
504defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
505defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
506defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
507defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
508defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
509defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>; // Y form: 2 cycles on JLAGU, 2 uops
510defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
511defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
512
513defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>; // FP stores go through JFPU1/JSTC plus the store AGU
514defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
515defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
516defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
517defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
518defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
519
520defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
521defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
522defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
523defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
524
525defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
526defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
527defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
528defm : X86WriteResUnsupported<WriteFMoveZ>;
529
530defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
531
532defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>; // pair args: write, ports, latency[, cycles, uops]
533defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
534defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
535defm : X86WriteResPairUnsupported<WriteFAddZ>; // the Z variants are marked unsupported throughout this table
536defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
537defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
538defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
539defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
540defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
541defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
542defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
543defm : X86WriteResPairUnsupported<WriteFCmpZ>;
544defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
545defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
546defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
547defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
548defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
549defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0],  3>;
550defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
551defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
552defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
553defm : X86WriteResPairUnsupported<WriteFMulZ>;
554defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
555defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
556defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
557defm : X86WriteResPairUnsupported<WriteFMul64Z>;
558defm : X86WriteResPairUnsupported<WriteFMA>;
559defm : X86WriteResPairUnsupported<WriteFMAX>;
560defm : X86WriteResPairUnsupported<WriteFMAY>;
561defm : X86WriteResPairUnsupported<WriteFMAZ>;
562defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
563defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
564defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
565defm : X86WriteResPairUnsupported<WriteDPPSZ>;
566defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
567defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
568defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
569defm : X86WriteResPairUnsupported<WriteFRcpZ>;
570defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
571defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
572defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
573defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
574defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>; // divides and square roots hold JFPM for their full latency
575defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
576defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
577defm : X86WriteResPairUnsupported<WriteFDivZ>;
578defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
579defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
580defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
581defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
582defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
583defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
584defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
585defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
586defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
587defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
588defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
589defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
590defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
591defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
592defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
593defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
594defm : X86WriteResPairUnsupported<WriteFRndZ>;
595defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
596defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
597defm : X86WriteResPairUnsupported<WriteFLogicZ>;
598defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
599defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
600defm : X86WriteResPairUnsupported<WriteFTestZ>;
601defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
602defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
603defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
604defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
605defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
606defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
607defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
608defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
609defm : X86WriteResPairUnsupported<WriteFBlendZ>;
610defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
611defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
612defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
613defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
614defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
615
616////////////////////////////////////////////////////////////////////////////////
617// Conversions.
618////////////////////////////////////////////////////////////////////////////////
619
620defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>; // result lands in the integer unit via JALU0
621defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
622defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
623defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
624defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
625defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
626defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
627defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
628
629defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
630defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>; // load form: adds JLAGU and 5cy
631defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
632defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
633defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
634defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
635defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>; // load form: adds JLAGU and 5cy
636defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
637defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
638defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
639
640defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
641defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
642defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
643defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
644
645defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
646defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
647defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
648defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
649
650defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
651defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
652defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
653
654defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
655defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
656defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
657defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>; // store form: adds JSAGU and 1cy
658defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>; // store form: adds JSAGU and 1cy
659defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
660
661////////////////////////////////////////////////////////////////////////////////
662// Vector integer operations.
663////////////////////////////////////////////////////////////////////////////////
664
// Vector loads: every entry lists JLAGU; the X/Y forms list only JLAGU.
// NOTE(review): positional arguments appear to be <write, resources, latency,
// resource cycles, micro-ops>; confirm against the multiclass definitions.
665defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
666defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
667defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
668defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
669defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
670defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
671defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
672
// Vector stores: JSAGU plus JFPU1/JSTC. All four masked-store writes are
// marked unsupported.
673defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
674defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
675defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
676defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
677defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
678defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
679defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
680defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
681defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
682
// Vector register moves, including transfers to and from GPRs. The to-GPR
// move lists JALU0 in addition to JFPU0/JFPA.
683defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
684defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
685defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
686defm : X86WriteResUnsupported<WriteVecMoveZ>;
687defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
688defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
689
// Vector integer arithmetic, shifts, multiplies, shuffles, blends, logic and
// test writes. Apart from WriteVecTestY, every Y- and Z-suffixed write in
// this group is marked unsupported, as are the variable vector shifts and
// the 256-bit shuffle/VPMOV writes.
690defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
691defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
692defm : X86WriteResPairUnsupported<WriteVecALUY>;
693defm : X86WriteResPairUnsupported<WriteVecALUZ>;
694defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
695defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
696defm : X86WriteResPairUnsupported<WriteVecShiftY>;
697defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
698defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
699defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
700defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
701defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
702defm : X86WriteResPairUnsupported<WriteVarVecShift>;
703defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
704defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// Integer multiplies and MPSAD list JFPU0 with JVIMUL.
705defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
706defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
707defm : X86WriteResPairUnsupported<WriteVecIMulY>;
708defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
709defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
710defm : X86WriteResPairUnsupported<WritePMULLDY>;
711defm : X86WriteResPairUnsupported<WritePMULLDZ>;
712defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
713defm : X86WriteResPairUnsupported<WriteMPSADY>;
714defm : X86WriteResPairUnsupported<WriteMPSADZ>;
715defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
716defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
717defm : X86WriteResPairUnsupported<WritePSADBWY>;
718defm : X86WriteResPairUnsupported<WritePSADBWZ>;
719defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
720defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
721defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
722defm : X86WriteResPairUnsupported<WriteShuffleY>;
723defm : X86WriteResPairUnsupported<WriteShuffleZ>;
724defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
725defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
726defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
727defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
728defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
729defm : X86WriteResPairUnsupported<WriteBlendY>;
730defm : X86WriteResPairUnsupported<WriteBlendZ>;
731defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
732defm : X86WriteResPairUnsupported<WriteVarBlendY>;
733defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
734defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
735defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
736defm : X86WriteResPairUnsupported<WriteVecLogicY>;
737defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
// Vector test writes list JALU0 alongside the FP resources.
738defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
739defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
740defm : X86WriteResPairUnsupported<WriteVecTestZ>;
741defm : X86WriteResPairUnsupported<WriteShuffle256>;
742defm : X86WriteResPairUnsupported<WriteVPMOV256>;
743defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
744
745////////////////////////////////////////////////////////////////////////////////
746// Vector insert/extract operations.
747////////////////////////////////////////////////////////////////////////////////
748
// Insert writes list JFPU01/JVALU (the load form adds JLAGU). Extract lists
// JFPU0/JFPA/JALU0, and the extract-to-store form lists JFPU1/JSTC/JSAGU.
749defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
750defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
751defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
752defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
753
754////////////////////////////////////////////////////////////////////////////////
755// SSE42 String instructions.
756////////////////////////////////////////////////////////////////////////////////
757
// PCMPISTRI/PCMPISTRM and PCMPESTRI/PCMPESTRM writes. The two PCmpEStr*
// entries are identical to each other and list more resources (including
// JSAGU and JLAGU) than the PCmpIStr* entries.
758defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
759defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
760defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
761defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
762
763////////////////////////////////////////////////////////////////////////////////
764// MOVMSK Instructions.
765////////////////////////////////////////////////////////////////////////////////
766
// The FP, vector and MMX MOVMSK writes share one description: resources
// JFPU0, JFPA and JALU0 with a latency of 3. WriteVecMOVMSKY is unsupported.
767def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
768def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
769defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
770def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
771
772////////////////////////////////////////////////////////////////////////////////
773// AES Instructions.
774////////////////////////////////////////////////////////////////////////////////
775
// AESIMC and AESKEYGEN list JFPU0/JVIMUL; the AES decode/encode write lists
// JFPU01/JVALU in addition to JFPU0/JVIMUL.
776defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
777defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
778defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
779
780////////////////////////////////////////////////////////////////////////////////
781// Horizontal add/sub instructions.
782////////////////////////////////////////////////////////////////////////////////
783
// FP horizontal adds list JFPU0/JFPA; integer horizontal adds list
// JFPU01/JVALU. WritePHAddY is marked unsupported.
784defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
785defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
786defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
787defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
788defm : X86WriteResPairUnsupported<WritePHAddY>;
789
790////////////////////////////////////////////////////////////////////////////////
791// Carry-less multiplication instructions.
792////////////////////////////////////////////////////////////////////////////////
793
// Carry-less multiply lists the same resources (JFPU0, JVIMUL) as the vector
// integer multiply writes.
794defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
795
796////////////////////////////////////////////////////////////////////////////////
797// SSE4A instructions.
798////////////////////////////////////////////////////////////////////////////////
799
// Scheduling write for INSERTQ/INSERTQI: latency 2, with resource cycles
// [1, 4] for the listed resources [JFPU01, JVALU].
800def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
801  let Latency = 2;
802  let ResourceCycles = [1, 4];
803}
804def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
805
806////////////////////////////////////////////////////////////////////////////////
807// AVX instructions.
808////////////////////////////////////////////////////////////////////////////////
809
// Scheduling write for VEXTRACTF128rr on JFPU01/JFPX. No fields are
// overridden, so the SchedWriteRes defaults apply.
810def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
811def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
812
// Scheduling write for the YMM broadcast-from-memory instructions listed
// below: latency 6, 2 micro-ops, resource cycles [1, 2, 4] for
// [JLAGU, JFPU01, JFPX].
813def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
814  let Latency = 6;
815  let ResourceCycles = [1, 2, 4];
816  let NumMicroOps = 2;
817}
818def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
819                                            VBROADCASTSSYrm,
820                                            VBROADCASTF128)>;
821
// Scheduling write for VZEROALL: no processor resources are listed; only a
// latency of 90 and a count of 73 micro-ops are specified.
822def JWriteJVZEROALL: SchedWriteRes<[]> {
823  let Latency = 90;
824  let NumMicroOps = 73;
825}
826def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
827
// Scheduling write for VZEROUPPER: no processor resources are listed; only a
// latency of 46 and a count of 37 micro-ops are specified.
828def JWriteJVZEROUPPER: SchedWriteRes<[]> {
829  let Latency = 46;
830  let NumMicroOps = 37;
831}
832def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
833
834///////////////////////////////////////////////////////////////////////////////
835//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
836///////////////////////////////////////////////////////////////////////////////
837
// Scheduling write shared by the SSE and AVX MASKMOVDQU variants listed
// below: latency 34, 63 micro-ops. ResourceCycles has one entry per listed
// resource, in the same order; the largest counts fall on JSAGU (16) and
// JALU01 (42).
838def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
839  let Latency = 34;
840  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
841  let NumMicroOps = 63;
842}
843def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32,
844                                         VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>;
845
846///////////////////////////////////////////////////////////////////////////////
847//  SchedWriteVariant definitions.
848///////////////////////////////////////////////////////////////////////////////
849
// A write with no processor resources and zero latency. It is the
// zero-idiom alternative in the SchedWriteVariant definitions that follow.
850def JWriteZeroLatency : SchedWriteRes<[]> {
851  let Latency = 0;
852}
853
// Zero-idiom write for YMM instructions. Unlike JWriteZeroLatency it still
// lists resources (JFPU01, JFPX) and is specified as 2 micro-ops.
854def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
855  let NumMicroOps = 2;
856}
857
858// Certain instructions that use the same register for both source
859// operands do not have a real dependency on the previous contents of the
860// register, and thus, do not have to wait before completing. They can be
861// optimized out at register renaming stage.
862// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
863// 15h Processors".
864// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
865// Section 21.8 [Dependency-breaking instructions].
866
// GPR SUB/XOR register-register forms: use JWriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise fall back to the regular WriteALU.
867def JWriteZeroIdiom : SchedWriteVariant<[
868    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
869    SchedVar<NoSchedPred,                          [WriteALU]>
870]>;
871def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
872                                        XOR32rr, XOR64rr)>;
873
// XMM FP XOR/ANDN register-register forms: use JWriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise fall back to WriteFLogic.
874def JWriteFZeroIdiom : SchedWriteVariant<[
875    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
876    SchedVar<NoSchedPred,                          [WriteFLogic]>
877]>;
878def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
879                                         ANDNPSrr, VANDNPSrr,
880                                         ANDNPDrr, VANDNPDrr)>;
881
// YMM FP XOR/ANDN register-register forms: the zero-idiom case uses
// JWriteZeroIdiomYmm (not JWriteZeroLatency); otherwise WriteFLogicY applies.
882def JWriteFZeroIdiomY : SchedWriteVariant<[
883    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
884    SchedVar<NoSchedPred,                          [WriteFLogicY]>
885]>;
886def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
887                                          VANDNPSYrr, VANDNPDYrr)>;
888
// MMX PXOR/PANDN register-register forms: use JWriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise fall back to WriteVecLogic.
889def JWriteVZeroIdiomLogic : SchedWriteVariant<[
890    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
891    SchedVar<NoSchedPred,                          [WriteVecLogic]>
892]>;
893def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;
894
// SSE/AVX XMM PXOR/PANDN register-register forms: use JWriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise fall back to WriteVecLogicX.
895def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
896    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
897    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
898]>;
899def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
900                                               PANDNrr, VPANDNrr)>;
901
// MMX subtract and compare-greater-than register-register forms: use
// JWriteZeroLatency when ZeroIdiomPredicate holds, otherwise WriteVecALU.
902def JWriteVZeroIdiomALU : SchedWriteVariant<[
903    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
904    SchedVar<NoSchedPred,                          [WriteVecALU]>
905]>;
906def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr,
907                                            MMX_PSUBQrr, MMX_PSUBWrr,
908                                            MMX_PSUBSBrr, MMX_PSUBSWrr,
909                                            MMX_PSUBUSBrr, MMX_PSUBUSWrr,
910                                            MMX_PCMPGTBrr, MMX_PCMPGTDrr,
911                                            MMX_PCMPGTWrr)>;
912
// SSE/AVX XMM subtract and compare-greater-than register-register forms: use
// JWriteZeroLatency when ZeroIdiomPredicate holds, otherwise WriteVecALUX.
// Each SSE opcode is listed next to its VEX-prefixed counterpart.
913def JWriteVZeroIdiomALUX : SchedWriteVariant<[
914    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
915    SchedVar<NoSchedPred,                          [WriteVecALUX]>
916]>;
917def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
918                                             PSUBDrr, VPSUBDrr,
919                                             PSUBQrr, VPSUBQrr,
920                                             PSUBWrr, VPSUBWrr,
921                                             PSUBSBrr, VPSUBSBrr,
922                                             PSUBSWrr, VPSUBSWrr,
923                                             PSUBUSBrr, VPSUBUSBrr,
924                                             PSUBUSWrr, VPSUBUSWrr,
925                                             PCMPGTBrr, VPCMPGTBrr,
926                                             PCMPGTDrr, VPCMPGTDrr,
927                                             PCMPGTQrr, VPCMPGTQrr,
928                                             PCMPGTWrr, VPCMPGTWrr)>;
929
// VPERM2F128rr: use JWriteZeroIdiomYmm when ZeroIdiomVPERMPredicate holds,
// otherwise fall back to WriteFShuffle256.
930def JWriteVPERM2F128 : SchedWriteVariant<[
931  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
932  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
933]>;
934def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
935
936// This write is used for slow LEA instructions.
// Lists JALU1 and JSAGU with a latency of 2. Selected by JWriteLEA when
// JSlowLEAPredicate holds.
937def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
938  let Latency = 2;
939}
940
941// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
942// with a `Scale` value different than 1.
// Predicate that holds when either alternative below matches (CheckAny).
// The second alternative requires both checks (CheckAll): operand 2 is an
// immediate, and that immediate is not 1.
943def JSlowLEAPredicate : MCSchedPredicate<
944  CheckAny<[
945    // A 3-operand LEA (base, index, offset).
946    IsThreeOperandsLEAFn,
947    // An LEA with a "Scale" different than 1.
948    CheckAll<[
949      CheckIsImmOperand<2>,
950      CheckNot<CheckImmOperand<2, 1>>
951    ]>
952  ]>
953>;
954
// LEA32r/LEA64r/LEA64_32r: use JWrite3OpsLEA when JSlowLEAPredicate holds,
// otherwise fall back to the regular WriteLEA.
955def JWriteLEA : SchedWriteVariant<[
956    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
957    SchedVar<NoSchedPred,       [WriteLEA]>
958]>;
959
960def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
961
// Scheduling write for LEA16r: latency 3, with 4 resource cycles on JALU01.
// It is applied unconditionally (no predicate / variant is involved).
962def JSlowLEA16r : SchedWriteRes<[JALU01]> {
963  let Latency = 3;
964  let ResourceCycles = [4];
965}
966
967def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
968
969///////////////////////////////////////////////////////////////////////////////
970// Dependency breaking instructions.
971///////////////////////////////////////////////////////////////////////////////
972
// Declares the zero-idiom opcodes for this model, grouped by register class /
// ISA extension. Each DepBreakingClass pairs an opcode list with the
// predicate that must hold: ZeroIdiomPredicate for all groups except
// VPERM2F128rr, which uses ZeroIdiomVPERMPredicate.
// NOTE(review): the opcode lists mirror those attached to the JWrite*ZeroIdiom*
// variants via InstRW; presumably the two must be kept in sync — confirm.
973def : IsZeroIdiomFunction<[
974  // GPR Zero-idioms.
975  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
976
977  // MMX Zero-idioms.
978  DepBreakingClass<[
979    MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
980    MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
981    MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
982    MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
983  ], ZeroIdiomPredicate>,
984
985  // SSE Zero-idioms.
986  DepBreakingClass<[
987    // fp variants.
988    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
989
990    // int variants.
991    PXORrr, PANDNrr,
992    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
993    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
994    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
995  ], ZeroIdiomPredicate>,
996
997  // AVX Zero-idioms.
998  DepBreakingClass<[
999    // xmm fp variants.
1000    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
1001
1002    // xmm int variants.
1003    VPXORrr, VPANDNrr,
1004    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1005    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
1006    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1007
1008    // ymm variants.
1009    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1010  ], ZeroIdiomPredicate>,
1011
1012  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
1013]>;
1014
// Declares further dependency-breaking opcodes: SBB and the PCMPEQ families
// under ZeroIdiomPredicate, and CMP32rr/CMP64rr when operands 0 and 1 are
// the same register (CheckSameRegOperand<0, 1>).
// NOTE(review): none of these opcodes appear in the IsZeroIdiomFunction list,
// so this list looks like it supplements that one — confirm against the
// IsDepBreakingFunction definition.
1015def : IsDepBreakingFunction<[
1016  // GPR
1017  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
1018  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
1019
1020  // MMX
1021  DepBreakingClass<[
1022    MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
1023  ], ZeroIdiomPredicate>,
1024
1025  // SSE
1026  DepBreakingClass<[
1027    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1028  ], ZeroIdiomPredicate>,
1029
1030  // AVX
1031  DepBreakingClass<[
1032    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1033  ], ZeroIdiomPredicate>
1034]>;
1035
// Declares the register-to-register move opcodes below (GPR, MMX, SSE and
// AVX xmm forms) as optimizable register moves. The class uses TruePred, so
// no additional operand condition is attached. No ymm move opcodes are listed.
1036def : IsOptimizableRegisterMove<[
1037  InstructionEquivalenceClass<[
1038    // GPR variants.
1039    MOV32rr, MOV64rr,
1040
1041    // MMX variants.
1042    MMX_MOVQ64rr,
1043
1044    // SSE variants.
1045    MOVAPSrr, MOVUPSrr,
1046    MOVAPDrr, MOVUPDrr,
1047    MOVDQArr, MOVDQUrr,
1048
1049    // AVX variants.
1050    VMOVAPSrr, VMOVUPSrr,
1051    VMOVAPDrr, VMOVUPDrr,
1052    VMOVDQArr, VMOVDQUrr
1053  ], TruePred >
1054]>;
1055
1056} // SchedModel
1057