xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD btver2 (Jaguar) to support
10// instruction scheduling and other instruction cost heuristics. Based off AMD Software
11// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
12//
13//===----------------------------------------------------------------------===//
14
15def BtVer2Model : SchedMachineModel {
16  // All x86 instructions are modeled as a single micro-op, and btver2 can
17  // decode 2 instructions per cycle.
18  let IssueWidth = 2;
19  let MicroOpBufferSize = 64; // Retire Control Unit
20  let LoadLatency = 5; // FPU latency (worse case cf Integer 3 cycle latency)
21  let HighLatency = 25;
22  let MispredictPenalty = 14; // Minimum branch misdirection penalty
23  let PostRAScheduler = 1;
24
25  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
26  // the scheduler to assign a default model to unrecognized opcodes.
27  let CompleteModel = 0;
28}
29
30let SchedModel = BtVer2Model in {
31
32// Jaguar can issue up to 6 micro-ops in one cycle
33def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
34def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
35def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
36def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
37def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
38def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
39
40// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
41// speculative version of the 64-bit integer registers.
42// Reference: www.realworldtech.com/jaguar/4/
43//
44// The processor always keeps the different parts of an integer register
45// together. An instruction that writes to a part of a register will therefore
46// have a false dependence on any previous write to the same register or any
47// part of it.
48// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
49// access" - Agner Fog's "microarchitecture.pdf".
50def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
51                               0,  // Max moves that can be eliminated per cycle.
52                               1>; // Restrict move elimination to zero regs.
53
54// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
55// registers. Operations on 256-bit data types are cracked into two COPs.
56// Reference: www.realworldtech.com/jaguar/4/
57
58// The PRF in the floating point unit can eliminate a move from a MMX or SSE
59// register that is know to be zero (i.e. it has been zeroed using a zero-idiom
60// dependency breaking instruction, or via VZEROALL).
61// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
62// instructions" - Agner Fog's "microarchitecture.pdf"
63def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
64                          0,  // Max moves that can be eliminated per cycle.
65                          1>; // Restrict move elimination to zero regs.
66
67// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
68// retire up to two macro-ops per cycle.
69// Reference: "Software Optimization Guide for AMD Family 16h Processors"
70def JRCU : RetireControlUnit<64, 2>;
71
72// Integer Pipe Scheduler
73def JALU01 : ProcResGroup<[JALU0, JALU1]> {
74  let BufferSize=20;
75}
76
77// AGU Pipe Scheduler
78def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
79  let BufferSize=12;
80}
81
82// Fpu Pipe Scheduler
83def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
84  let BufferSize=18;
85}
86
87// Functional units
88def JDiv    : ProcResource<1>; // integer division
89def JMul    : ProcResource<1>; // integer multiplication
90def JVALU0  : ProcResource<1>; // vector integer
91def JVALU1  : ProcResource<1>; // vector integer
92def JVIMUL  : ProcResource<1>; // vector integer multiplication
93def JSTC    : ProcResource<1>; // vector store/convert
94def JFPM    : ProcResource<1>; // FP multiplication
95def JFPA    : ProcResource<1>; // FP addition
96
97// Functional unit groups
98def JFPX  : ProcResGroup<[JFPA, JFPM]>;
99def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
100
101// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
102// cycles after the memory operand.
103def : ReadAdvance<ReadAfterLd, 3>;
104
105// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
106// cycles after the memory operand.
107def : ReadAdvance<ReadAfterVecLd, 5>;
108def : ReadAdvance<ReadAfterVecXLd, 5>;
109def : ReadAdvance<ReadAfterVecYLd, 5>;
110
111/// "Additional 6 cycle transfer operation which moves a floating point
112/// operation input value from the integer unit to the floating point unit.
113/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
114def : ReadAdvance<ReadInt2Fpu, -6>;
115
116// Many SchedWrites are defined in pairs with and without a folded load.
117// Instructions with folded loads are usually micro-fused, so they only appear
118// as two micro-ops when dispatched by the schedulers.
119// This multiclass defines the resource usage for variants with and without
120// folded loads.
121multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
122                            list<ProcResourceKind> ExePorts,
123                            int Lat, list<int> Res = [], int UOps = 1,
124                            int LoadUOps = 0> {
125  // Register variant is using a single cycle on ExePort.
126  def : WriteRes<SchedRW, ExePorts> {
127    let Latency = Lat;
128    let ReleaseAtCycles = Res;
129    let NumMicroOps = UOps;
130  }
131
132  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
133  // latency.
134  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
135    let Latency = !add(Lat, 3);
136    let ReleaseAtCycles = !if(!empty(Res), [], !listconcat([1], Res));
137    let NumMicroOps = !add(UOps, LoadUOps);
138  }
139}
140
141multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
142                            list<ProcResourceKind> ExePorts,
143                            int Lat, list<int> Res = [], int UOps = 1,
144                            int LoadUOps = 0> {
145  // Register variant is using a single cycle on ExePort.
146  def : WriteRes<SchedRW, ExePorts> {
147    let Latency = Lat;
148    let ReleaseAtCycles = Res;
149    let NumMicroOps = UOps;
150  }
151
152  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
153  // latency.
154  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
155    let Latency = !add(Lat, 5);
156    let ReleaseAtCycles = !if(!empty(Res), [], !listconcat([1], Res));
157    let NumMicroOps = !add(UOps, LoadUOps);
158  }
159}
160
161multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
162                            list<ProcResourceKind> ExePorts,
163                            int Lat, list<int> Res = [2], int UOps = 2,
164                            int LoadUOps = 0> {
165  // Register variant is using a single cycle on ExePort.
166  def : WriteRes<SchedRW, ExePorts> {
167    let Latency = Lat;
168    let ReleaseAtCycles = Res;
169    let NumMicroOps = UOps;
170  }
171
172  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
173  // latency.
174  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
175    let Latency = !add(Lat, 5);
176    let ReleaseAtCycles = !listconcat([2], Res);
177    let NumMicroOps = !add(UOps, LoadUOps);
178  }
179}
180
181// Instructions that have local forwarding disabled have an extra +1cy latency.
182
183// A folded store needs a cycle on the SAGU for the store data, most RMW
184// instructions don't need an extra uop.  ALU RMW operations don't seem to
185// benefit from STLF, and their observed latency is 6cy. That is the reason why
186// this write adds two extra cycles (instead of just 1cy for the store).
187defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
188
189////////////////////////////////////////////////////////////////////////////////
190// Arithmetic.
191////////////////////////////////////////////////////////////////////////////////
192
193defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
194defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
195
196defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
197defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
198defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
199defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
200defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;
201
202defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
203defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
204defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
205defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
206defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
207defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
208defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
209defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
210defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
211defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
212defm : X86WriteResUnsupported<WriteIMulH>;
213defm : X86WriteResUnsupported<WriteIMulHLd>;
214defm : X86WriteResPairUnsupported<WriteMULX32>;
215defm : X86WriteResPairUnsupported<WriteMULX64>;
216
217defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
218defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
219defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
220defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
221defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
222defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
223defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
224defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
225
226defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
227
228defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
229defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
230def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
231def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
232def  : WriteRes<WriteLAHFSAHF, [JALU01]>;
233
234defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
235defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
236defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
237defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
238defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
239defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
240
241// This is for simple LEAs with one or two input operands.
242def : WriteRes<WriteLEA, [JALU01]>;
243
244// Bit counts.
245defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
246defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
247defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
248defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
249defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
250
251// BMI1 BEXTR/BLS, BMI2 BZHI
252defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
253defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
254defm : X86WriteResPairUnsupported<WriteBZHI>;
255
256////////////////////////////////////////////////////////////////////////////////
257// Integer shifts and rotates.
258////////////////////////////////////////////////////////////////////////////////
259
260defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
261defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
262defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
263defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
264
265// SHLD/SHRD.
266defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
267defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
268defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
269defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
270
271////////////////////////////////////////////////////////////////////////////////
272// Loads, stores, and moves, not folded with other operations.
273////////////////////////////////////////////////////////////////////////////////
274
275def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
276def : WriteRes<WriteStore,   [JSAGU]>;
277def : WriteRes<WriteStoreNT, [JSAGU]>;
278def : WriteRes<WriteMove,    [JALU01]>;
279defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
280
281// Load/store MXCSR.
282def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
283def : WriteRes<WriteSTMXCSR, [JSAGU]>;
284
285// Treat misc copies as a move.
286def : InstRW<[WriteMove], (instrs COPY)>;
287
288////////////////////////////////////////////////////////////////////////////////
289// Idioms that clear a register, like xorps %xmm0, %xmm0.
290// These can often bypass execution ports completely.
291////////////////////////////////////////////////////////////////////////////////
292
293def : WriteRes<WriteZero,  []>;
294
295////////////////////////////////////////////////////////////////////////////////
296// Branches don't produce values, so they have no latency, but they still
297// consume resources. Indirect branches can fold loads.
298////////////////////////////////////////////////////////////////////////////////
299
300defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
301
302////////////////////////////////////////////////////////////////////////////////
303// Special case scheduling classes.
304////////////////////////////////////////////////////////////////////////////////
305
306def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
307def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
308def : WriteRes<WriteFence,  [JSAGU]>;
309
310// Nops don't have dependencies, so there's no actual latency, but we set this
311// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
312def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
313
314def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
315  let Latency = 3;
316  let ReleaseAtCycles = [3];
317  let NumMicroOps = 3;
318}
319
320def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
321  let Latency = 16;
322  let ReleaseAtCycles = [3,16,16];
323  let NumMicroOps = 5;
324}
325
326def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
327  let Latency = 17;
328  let ReleaseAtCycles = [3,17,17];
329  let NumMicroOps = 6;
330}
331
332def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
333  let Latency = 11;
334  let ReleaseAtCycles = [3,1,1];
335  let NumMicroOps = 5;
336}
337
338def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
339  let Latency = 11;
340  let ReleaseAtCycles = [3,1,1];
341  let NumMicroOps = 18;
342}
343
344def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
345  let Latency = 32;
346  let ReleaseAtCycles = [6,1,1];
347  let NumMicroOps = 28;
348}
349
350def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
351  let Latency = 19;
352  let ReleaseAtCycles = [3,19,19];
353  let NumMicroOps = 18;
354}
355
356def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
357  let Latency = 38;
358  let ReleaseAtCycles = [6,38,38];
359  let NumMicroOps = 28;
360}
361
362def JWriteCMPXCHGVariant :  SchedWriteVariant<[
363  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
364  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
365  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
366  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
367  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
368  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
369  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
370  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
371  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
372  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
373]>;
374
375// The first five reads are contributed by the memory load operand.
376// We ignore those reads and set a read-advance for the other input operands
377// including the implicit read of RAX.
378def : InstRW<[JWriteCMPXCHGVariant,
379              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
380              ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
381                                                 LCMPXCHG32, LCMPXCHG64,
382                                                 CMPXCHG8rm, CMPXCHG16rm,
383                                                 CMPXCHG32rm, CMPXCHG64rm)>;
384
385def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
386                                             CMPXCHG32rr, CMPXCHG64rr)>;
387
388def : InstRW<[JWriteCMPXCHGVariant,
389              // Ignore reads contributed by the memory operand.
390              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
391              // Add a read-advance to every implicit register read.
392              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
393                                                                           CMPXCHG8B, CMPXCHG16B)>;
394
395def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
396  let Latency = 19;
397  let ReleaseAtCycles = [1,19,19];
398  let NumMicroOps = 1;
399}
400
401def JWriteLOCK_ALURMWVariant :  SchedWriteVariant<[
402  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
403  SchedVar<NoSchedPred,                       [WriteALURMW]>
404]>;
405def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
406                                                 DEC8m, DEC16m, DEC32m, DEC64m,
407                                                 NOT8m, NOT16m, NOT32m, NOT64m,
408                                                 NEG8m, NEG16m, NEG32m, NEG64m)>;
409
410def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
411  let Latency = 2;
412  let ReleaseAtCycles = [3];
413  let NumMicroOps = 3;
414}
415def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
416                                                      XADD32rr, XADD64rr)>;
417
418// This write defines the latency of the in/out register operand of a non-atomic
419// XADDrm. This is the first of a pair of writes that model non-atomic
420// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
421//
422// We need two writes because the instruction latency differs from the output
423// register operand latency. In particular, the first write describes the first
424// (and only) output register operand of the instruction.  However, the
425// instruction latency is set to the MAX of all the write latencies. That's why
426// a second write is needed in this case (see example below).
427//
428// Example:
429//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
430//                            ## ECX write Latency: 3cy
431//
432// Register ECX becomes available in 3 cycles. That is because the value of ECX
433// is exchanged with the value read from the stack pointer, and the load-to-use
434// latency is assumed to be 3cy.
435def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
436  let Latency = 3;  // load-to-use latency
437  let ReleaseAtCycles = [3];
438  let NumMicroOps = 3;
439}
440
441// This write defines the latency of the in/out register operand of an atomic
442// XADDrm. This is the first of a sequence of two writes used to model atomic
443// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
444//
445//
446// Example:
447//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
448//                               ## ECX write Latency: 11cy
449//
450// The value of ECX becomes available only after 11cy from the start of
451// execution. This write is used to specifically set that operand latency.
452def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
453  let Latency = 11;
454  let ReleaseAtCycles = [3];
455  let NumMicroOps = 3;
456}
457
458// This write defines the latency of the in/out register operand of an atomic
459// XCHGrm. This write is the first of a sequence of two writes that describe
460// atomic XCHG operations. We need two writes because the instruction latency
461// differs from the output register write latency.  We want to make sure that
462// the output register operand becomes visible after 11cy. However, we want to
463// set the instruction latency to 16cy.
464def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
465  let Latency = 11;
466  let ReleaseAtCycles = [2];
467  let NumMicroOps = 2;
468}
469
470def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
471  let Latency = 11;
472  let ReleaseAtCycles = [1, 1];
473  let NumMicroOps = 1;
474}
475
476def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
477  let Latency = 16;
478  let ReleaseAtCycles = [16, 16];
479  let NumMicroOps = 1;
480}
481
482def JWriteXADDrm_Part1 : SchedWriteVariant<[
483  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
484  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
485]>;
486
487def JWriteXADDrm_Part2 : SchedWriteVariant<[
488  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
489  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
490]>;
491
492def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
493                 (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
494                         LXADD8, LXADD16, LXADD32, LXADD64)>;
495
496def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
497                 (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
498
499
500////////////////////////////////////////////////////////////////////////////////
501// Floating point. This covers both scalar and vector operations.
502////////////////////////////////////////////////////////////////////////////////
503
504defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
505defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
506defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
507defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
508defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
509defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>;
510defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
511defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
512
513defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
514defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
515defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
516defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
517defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
518defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
519
520defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
521defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
522defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
523defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
524
525defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
526defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
527defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
528defm : X86WriteResUnsupported<WriteFMoveZ>;
529
530defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
531
532defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>;
533defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
534defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
535defm : X86WriteResPairUnsupported<WriteFAddZ>;
536defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
537defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
538defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
539defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
540defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
541defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
542defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
543defm : X86WriteResPairUnsupported<WriteFCmpZ>;
544defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
545defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
546defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
547defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
548defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
549defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0],  3>;
550defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
551defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
552defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
553defm : X86WriteResPairUnsupported<WriteFMulZ>;
554defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
555defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
556defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
557defm : X86WriteResPairUnsupported<WriteFMul64Z>;
558defm : X86WriteResPairUnsupported<WriteFMA>;
559defm : X86WriteResPairUnsupported<WriteFMAX>;
560defm : X86WriteResPairUnsupported<WriteFMAY>;
561defm : X86WriteResPairUnsupported<WriteFMAZ>;
562defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
563defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
564defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
565defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
566defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
567defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
568defm : X86WriteResPairUnsupported<WriteFRcpZ>;
569defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
570defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
571defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
572defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
573defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
574defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
575defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
576defm : X86WriteResPairUnsupported<WriteFDivZ>;
577defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
578defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
579defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
580defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
581defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
582defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
583defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
584defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
585defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
586defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
587defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
588defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
589defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
590defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
591defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
592defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
593defm : X86WriteResPairUnsupported<WriteFRndZ>;
594defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
595defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
596defm : X86WriteResPairUnsupported<WriteFLogicZ>;
597defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
598defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
599defm : X86WriteResPairUnsupported<WriteFTestZ>;
600defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
601defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
602defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
603defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
604defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
605defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
606defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
607defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
608defm : X86WriteResPairUnsupported<WriteFBlendZ>;
609defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
610defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
611defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
612defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
613defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
614
615////////////////////////////////////////////////////////////////////////////////
616// Conversions.
617////////////////////////////////////////////////////////////////////////////////
618
619defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
620defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
621defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
622defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
623defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
624defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
625defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
626defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
627
628defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
629defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
630defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
631defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
632defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
633defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
634defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
635defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
636defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
637defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
638
639defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
640defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
641defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
642defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
643
644defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
645defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
646defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
647defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
648
649defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
650defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
651defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
652
653defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
654defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
655defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
656defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
657defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
658defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
659
660////////////////////////////////////////////////////////////////////////////////
661// Vector integer operations.
662////////////////////////////////////////////////////////////////////////////////
663
664defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
665defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
666defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
667defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
668defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
669defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
670defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
671
672defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
673defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
674defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
675defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
676defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
677defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
678defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
679defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
680defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
681
682defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
683defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
684defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
685defm : X86WriteResUnsupported<WriteVecMoveZ>;
686defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
687defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
688
689defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
690defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
691defm : X86WriteResPairUnsupported<WriteVecALUY>;
692defm : X86WriteResPairUnsupported<WriteVecALUZ>;
693defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
694defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
695defm : X86WriteResPairUnsupported<WriteVecShiftY>;
696defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
697defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
698defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
699defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
700defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
701defm : X86WriteResPairUnsupported<WriteVarVecShift>;
702defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
703defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
704defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
705defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
706defm : X86WriteResPairUnsupported<WriteVecIMulY>;
707defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
708defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
709defm : X86WriteResPairUnsupported<WritePMULLDY>;
710defm : X86WriteResPairUnsupported<WritePMULLDZ>;
711defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
712defm : X86WriteResPairUnsupported<WriteMPSADY>;
713defm : X86WriteResPairUnsupported<WriteMPSADZ>;
714defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
715defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
716defm : X86WriteResPairUnsupported<WritePSADBWY>;
717defm : X86WriteResPairUnsupported<WritePSADBWZ>;
718defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
719defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
720defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
721defm : X86WriteResPairUnsupported<WriteShuffleY>;
722defm : X86WriteResPairUnsupported<WriteShuffleZ>;
723defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
724defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
725defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
726defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
727defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
728defm : X86WriteResPairUnsupported<WriteBlendY>;
729defm : X86WriteResPairUnsupported<WriteBlendZ>;
730defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
731defm : X86WriteResPairUnsupported<WriteVarBlendY>;
732defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
733defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
734defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
735defm : X86WriteResPairUnsupported<WriteVecLogicY>;
736defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
737defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
738defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
739defm : X86WriteResPairUnsupported<WriteVecTestZ>;
740defm : X86WriteResPairUnsupported<WriteShuffle256>;
741defm : X86WriteResPairUnsupported<WriteVPMOV256>;
742defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
743
744////////////////////////////////////////////////////////////////////////////////
745// Vector insert/extract operations.
746////////////////////////////////////////////////////////////////////////////////
747
748defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
749defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
750defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
751defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
752
753////////////////////////////////////////////////////////////////////////////////
754// SSE42 String instructions.
755////////////////////////////////////////////////////////////////////////////////
756
757defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
758defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
759defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
760defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
761
762////////////////////////////////////////////////////////////////////////////////
763// MOVMSK Instructions.
764////////////////////////////////////////////////////////////////////////////////
765
766def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
767def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
768defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
769def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
770
771////////////////////////////////////////////////////////////////////////////////
772// AES Instructions.
773////////////////////////////////////////////////////////////////////////////////
774
775defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
776defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
777defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
778
779////////////////////////////////////////////////////////////////////////////////
780// Horizontal add/sub  instructions.
781////////////////////////////////////////////////////////////////////////////////
782
783defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
784defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
785defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
786defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
787defm : X86WriteResPairUnsupported<WritePHAddY>;
788
789////////////////////////////////////////////////////////////////////////////////
790// Carry-less multiplication instructions.
791////////////////////////////////////////////////////////////////////////////////
792
793defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
794
795////////////////////////////////////////////////////////////////////////////////
796// SSE4A instructions.
797////////////////////////////////////////////////////////////////////////////////
798
799def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
800  let Latency = 2;
801  let ReleaseAtCycles = [1, 4];
802}
803def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
804
805////////////////////////////////////////////////////////////////////////////////
806// AVX instructions.
807////////////////////////////////////////////////////////////////////////////////
808
809def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
810def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
811
812def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
813  let Latency = 6;
814  let ReleaseAtCycles = [1, 2, 4];
815  let NumMicroOps = 2;
816}
817def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
818                                            VBROADCASTSSYrm,
819                                            VBROADCASTF128rm)>;
820
821def JWriteJVZEROALL: SchedWriteRes<[]> {
822  let Latency = 90;
823  let NumMicroOps = 73;
824}
825def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
826
827def JWriteJVZEROUPPER: SchedWriteRes<[]> {
828  let Latency = 46;
829  let NumMicroOps = 37;
830}
831def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
832
833///////////////////////////////////////////////////////////////////////////////
834//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ
835///////////////////////////////////////////////////////////////////////////////
836
837def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
838  let Latency = 34;
839  let ReleaseAtCycles = [1, 1, 2, 2, 2, 16, 42];
840  let NumMicroOps = 63;
841}
842def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
843                                         VMASKMOVDQU, VMASKMOVDQU64)>;
844
845///////////////////////////////////////////////////////////////////////////////
846//  SchedWriteVariant definitions.
847///////////////////////////////////////////////////////////////////////////////
848
849def JWriteZeroLatency : SchedWriteRes<[]> {
850  let Latency = 0;
851}
852
853def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
854  let NumMicroOps = 2;
855}
856
857// Certain instructions that use the same register for both source
858// operands do not have a real dependency on the previous contents of the
859// register, and thus, do not have to wait before completing. They can be
860// optimized out at register renaming stage.
861// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
862// 15h Processors".
863// Reference: Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
864// Section 21.8 [Dependency-breaking instructions].
865
866def JWriteZeroIdiom : SchedWriteVariant<[
867    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
868    SchedVar<NoSchedPred,                          [WriteALU]>
869]>;
870def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
871                                        XOR32rr, XOR64rr)>;
872
873def JWriteFZeroIdiom : SchedWriteVariant<[
874    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
875    SchedVar<NoSchedPred,                          [WriteFLogic]>
876]>;
877def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
878                                         ANDNPSrr, VANDNPSrr,
879                                         ANDNPDrr, VANDNPDrr)>;
880
881def JWriteFZeroIdiomY : SchedWriteVariant<[
882    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
883    SchedVar<NoSchedPred,                          [WriteFLogicY]>
884]>;
885def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
886                                          VANDNPSYrr, VANDNPDYrr)>;
887
888def JWriteVZeroIdiomLogic : SchedWriteVariant<[
889    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
890    SchedVar<NoSchedPred,                          [WriteVecLogic]>
891]>;
892def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;
893
894def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
895    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
896    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
897]>;
898def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
899                                               PANDNrr, VPANDNrr)>;
900
901def JWriteVZeroIdiomALU : SchedWriteVariant<[
902    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
903    SchedVar<NoSchedPred,                          [WriteVecALU]>
904]>;
905def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr,
906                                            MMX_PSUBQrr, MMX_PSUBWrr,
907                                            MMX_PSUBSBrr, MMX_PSUBSWrr,
908                                            MMX_PSUBUSBrr, MMX_PSUBUSWrr,
909                                            MMX_PCMPGTBrr, MMX_PCMPGTDrr,
910                                            MMX_PCMPGTWrr)>;
911
912def JWriteVZeroIdiomALUX : SchedWriteVariant<[
913    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
914    SchedVar<NoSchedPred,                          [WriteVecALUX]>
915]>;
916def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
917                                             PSUBDrr, VPSUBDrr,
918                                             PSUBQrr, VPSUBQrr,
919                                             PSUBWrr, VPSUBWrr,
920                                             PSUBSBrr, VPSUBSBrr,
921                                             PSUBSWrr, VPSUBSWrr,
922                                             PSUBUSBrr, VPSUBUSBrr,
923                                             PSUBUSWrr, VPSUBUSWrr,
924                                             PCMPGTBrr, VPCMPGTBrr,
925                                             PCMPGTDrr, VPCMPGTDrr,
926                                             PCMPGTQrr, VPCMPGTQrr,
927                                             PCMPGTWrr, VPCMPGTWrr)>;
928
929def JWriteVPERM2F128 : SchedWriteVariant<[
930  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
931  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
932]>;
933def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
934
935// This write is used for slow LEA instructions.
936def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
937  let Latency = 2;
938}
939
940// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
941// with a `Scale` value different than 1.
942def JSlowLEAPredicate : MCSchedPredicate<
943  CheckAny<[
944    // A 3-operand LEA (base, index, offset).
945    IsThreeOperandsLEAFn,
946    // An LEA with a "Scale" different than 1.
947    CheckAll<[
948      CheckIsImmOperand<2>,
949      CheckNot<CheckImmOperand<2, 1>>
950    ]>
951  ]>
952>;
953
954def JWriteLEA : SchedWriteVariant<[
955    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
956    SchedVar<NoSchedPred,       [WriteLEA]>
957]>;
958
959def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
960
961def JSlowLEA16r : SchedWriteRes<[JALU01]> {
962  let Latency = 3;
963  let ReleaseAtCycles = [4];
964}
965
966def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
967
968///////////////////////////////////////////////////////////////////////////////
969// Dependency breaking instructions.
970///////////////////////////////////////////////////////////////////////////////
971
972def : IsZeroIdiomFunction<[
973  // GPR Zero-idioms.
974  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
975
976  // MMX Zero-idioms.
977  DepBreakingClass<[
978    MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
979    MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
980    MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
981    MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
982  ], ZeroIdiomPredicate>,
983
984  // SSE Zero-idioms.
985  DepBreakingClass<[
986    // fp variants.
987    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
988
989    // int variants.
990    PXORrr, PANDNrr,
991    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
992    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
993    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
994  ], ZeroIdiomPredicate>,
995
996  // AVX Zero-idioms.
997  DepBreakingClass<[
998    // xmm fp variants.
999    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
1000
1001    // xmm int variants.
1002    VPXORrr, VPANDNrr,
1003    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1004    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
1005    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1006
1007    // ymm variants.
1008    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1009  ], ZeroIdiomPredicate>,
1010
1011  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
1012]>;
1013
1014def : IsDepBreakingFunction<[
1015  // GPR
1016  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
1017  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
1018
1019  // MMX
1020  DepBreakingClass<[
1021    MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
1022  ], ZeroIdiomPredicate>,
1023
1024  // SSE
1025  DepBreakingClass<[
1026    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1027  ], ZeroIdiomPredicate>,
1028
1029  // AVX
1030  DepBreakingClass<[
1031    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1032  ], ZeroIdiomPredicate>
1033]>;
1034
1035def : IsOptimizableRegisterMove<[
1036  InstructionEquivalenceClass<[
1037    // GPR variants.
1038    MOV32rr, MOV64rr,
1039
1040    // MMX variants.
1041    MMX_MOVQ64rr,
1042
1043    // SSE variants.
1044    MOVAPSrr, MOVUPSrr,
1045    MOVAPDrr, MOVUPDrr,
1046    MOVDQArr, MOVDQUrr,
1047
1048    // AVX variants.
1049    VMOVAPSrr, VMOVUPSrr,
1050    VMOVAPDrr, VMOVUPDrr,
1051    VMOVDQArr, VMOVDQUrr
1052  ], TruePred >
1053]>;
1054
1055} // SchedModel
1056