xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td (revision ec0ea6efa1ad229d75c394c1a9b9cac33af2b1d3)
1//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD btver2 (Jaguar) to support
10// instruction scheduling and other instruction cost heuristics. Based off AMD Software
11// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
12//
13//===----------------------------------------------------------------------===//
14
// Top-level machine-model record for btver2 (Jaguar). The
// `let SchedModel = BtVer2Model in { ... }` region that follows binds the
// resources and writes defined in this file to this model.
15def BtVer2Model : SchedMachineModel {
16  // All x86 instructions are modeled as a single micro-op, and btver2 can
17  // decode 2 instructions per cycle.
  // NOTE(review): many writes later in this file set NumMicroOps > 1, so the
  // "single micro-op" statement above looks stale -- confirm.
18  let IssueWidth = 2;
19  let MicroOpBufferSize = 64; // Retire Control Unit
20  let LoadLatency = 5; // FPU latency (worst case; cf. integer 3-cycle latency)
21  let HighLatency = 25;
22  let MispredictPenalty = 14; // Minimum branch misdirection penalty
23  let PostRAScheduler = 1;
24
25  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
26  // the scheduler to assign a default model to unrecognized opcodes.
27  let CompleteModel = 0;
28}
29
30let SchedModel = BtVer2Model in {
31
32// Jaguar can issue up to 6 micro-ops in one cycle
// The six issue pipes are each modeled as a single-unit ProcResource
// (presumably one micro-op per pipe per cycle -- confirm).
33def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
34def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
35def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
36def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
37def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
38def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
39
40// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
41// speculative version of the 64-bit integer registers.
42// Reference: www.realworldtech.com/jaguar/4/
43//
44// The processor always keeps the different parts of an integer register
45// together. An instruction that writes to a part of a register will therefore
46// have a false dependence on any previous write to the same register or any
47// part of it.
48// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
49// access" - Agner Fog's "microarchitecture.pdf".
// Positional arguments as used here: number of physical registers (64), the
// register classes covered, two lists parallel to the class list, and the two
// trailing values annotated inline.
// NOTE(review): the two parallel lists are presumably the per-class register
// cost and the per-class move-elimination enable bit (RegisterFile is declared
// outside this file) -- confirm.
50def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
51                               0,  // Max moves that can be eliminated per cycle.
52                               1>; // Restrict move elimination to zero regs.
53
54// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
55// registers. Operations on 256-bit data types are cracked into two COPs.
56// Reference: www.realworldtech.com/jaguar/4/
57
58// The PRF in the floating point unit can eliminate a move from a MMX or SSE
59// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
60// dependency breaking instruction, or via VZEROALL).
61// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
62// instructions" - Agner Fog's "microarchitecture.pdf"
// 72-entry register file covering VR64, VR128 and VR256. In the first list
// parallel to the classes, VR256 is given 2 where VR64/VR128 are given 1.
// NOTE(review): presumably that list is the per-class register cost (a 256-bit
// value occupying two entries) and the following list enables move elimination
// for VR64/VR128 only -- confirm against the RegisterFile declaration.
63def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
64                          0,  // Max moves that can be eliminated per cycle.
65                          1>; // Restrict move elimination to zero regs.
66
67// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
68// retire up to two macro-ops per cycle.
69// Reference: "Software Optimization Guide for AMD Family 16h Processors"
// The two template arguments below are those two figures, in that order.
70def JRCU : RetireControlUnit<64, 2>;
71
// The three groups below pair up the issue pipes by scheduler. Each sets a
// BufferSize (presumably the number of entries in that scheduler's queue --
// confirm against the ProcResGroup/ProcResourceKind declaration).

72// Integer Pipe Scheduler
// Groups the two integer ALU pipes (JALU0, JALU1).
73def JALU01 : ProcResGroup<[JALU0, JALU1]> {
74  let BufferSize=20;
75}
76
77// AGU Pipe Scheduler
// Groups the load and store address-generation pipes (JLAGU, JSAGU).
78def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
79  let BufferSize=12;
80}
81
82// Fpu Pipe Scheduler
// Groups the two vector/FPU pipes (JFPU0, JFPU1).
83def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
84  let BufferSize=18;
85}
86
87// Functional units
// Each unit is a single-instance resource. Writes in this file list a unit
// together with the issue pipe it is reached through (e.g. [JALU1, JMul],
// [JFPU1, JSTC]).
88def JDiv    : ProcResource<1>; // integer division
89def JMul    : ProcResource<1>; // integer multiplication
90def JVALU0  : ProcResource<1>; // vector integer
91def JVALU1  : ProcResource<1>; // vector integer
92def JVIMUL  : ProcResource<1>; // vector integer multiplication
93def JSTC    : ProcResource<1>; // vector store/convert
94def JFPM    : ProcResource<1>; // FP multiplication
95def JFPA    : ProcResource<1>; // FP addition
96
97// Functional unit groups
// JFPX is {JFPA, JFPM} and JVALU is {JVALU0, JVALU1}. Writes that list a group
// (e.g. WriteFMove uses JFPX) do not name a specific member of it.
98def JFPX  : ProcResGroup<[JFPA, JFPM]>;
99def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
100
101// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
102// cycles after the memory operand.
103def : ReadAdvance<ReadAfterLd, 3>;
104
105// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
106// cycles after the memory operand.
107def : ReadAdvance<ReadAfterVecLd, 5>;
108def : ReadAdvance<ReadAfterVecXLd, 5>;
109def : ReadAdvance<ReadAfterVecYLd, 5>;
110
111/// "Additional 6 cycle transfer operation which moves a floating point
112/// operation input value from the integer unit to the floating point unit."
113/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
/// Note the negative advance: unlike the positive values above, it presumably
/// adds the quoted 6-cycle transfer to the effective latency seen by a
/// ReadInt2Fpu operand -- confirm against the ReadAdvance documentation.
114def : ReadAdvance<ReadInt2Fpu, -6>;
115
116// Many SchedWrites are defined in pairs with and without a folded load.
117// Instructions with folded loads are usually micro-fused, so they only appear
118// as two micro-ops when dispatched by the schedulers.
119// This multiclass defines the resource usage for variants with and without
120// folded loads.
//
// Parameters:
//   SchedRW  - foldable write; SchedRW gets the register form and
//              SchedRW.Folded gets the load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - ResourceCycles, parallel to ExePorts (default []: field left
//              empty in both forms).
//   UOps     - NumMicroOps of the register form (default 1).
//   LoadUOps - micro-ops added on top of UOps for the folded form (default 0).
121multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
122                            list<ProcResourceKind> ExePorts,
123                            int Lat, list<int> Res = [], int UOps = 1,
124                            int LoadUOps = 0> {
125  // Register variant: Lat cycles of latency and Res cycles on ExePorts.
  // (With the default Res = [] the ResourceCycles field is left empty,
  // presumably meaning one cycle per listed resource -- confirm.)
126  def : WriteRes<SchedRW, ExePorts> {
127    let Latency = Lat;
128    let ResourceCycles = Res;
129    let NumMicroOps = UOps;
130  }
131
132  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
133  // latency.
  // JLAGU is prepended to the resource list; when Res is non-empty a matching
  // 1 is prepended to the cycle list, otherwise the cycle list stays empty.
134  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
135    let Latency = !add(Lat, 3);
136    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
137    let NumMicroOps = !add(UOps, LoadUOps);
138  }
139}
140
// Defines the register form and the load-folded form of a foldable FP/vector
// write. Identical in shape to the integer pair multiclass, except that the
// folded form adds 5 cycles of latency instead of 3.
//
// Parameters:
//   SchedRW  - foldable write; SchedRW gets the register form and
//              SchedRW.Folded gets the load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - ResourceCycles, parallel to ExePorts (default []: field left
//              empty in both forms).
//   UOps     - NumMicroOps of the register form (default 1).
//   LoadUOps - micro-ops added on top of UOps for the folded form (default 0).
141multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
142                            list<ProcResourceKind> ExePorts,
143                            int Lat, list<int> Res = [], int UOps = 1,
144                            int LoadUOps = 0> {
145  // Register variant: Lat cycles of latency and Res cycles on ExePorts.
  // (With the default Res = [] the ResourceCycles field is left empty,
  // presumably meaning one cycle per listed resource -- confirm.)
146  def : WriteRes<SchedRW, ExePorts> {
147    let Latency = Lat;
148    let ResourceCycles = Res;
149    let NumMicroOps = UOps;
150  }
151
152  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
153  // latency.
  // JLAGU is prepended to the resource list; when Res is non-empty a matching
  // 1 is prepended to the cycle list, otherwise the cycle list stays empty.
154  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
155    let Latency = !add(Lat, 5);
156    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
157    let NumMicroOps = !add(UOps, LoadUOps);
158  }
159}
160
// Defines the register form and the load-folded form of a foldable 256-bit
// (YMM) write. Defaults differ from the Int/Fpu pair multiclasses: Res
// defaults to [2] and UOps to 2, and the folded form charges 2 cycles on JLAGU.
//
// Parameters:
//   SchedRW  - foldable write; SchedRW gets the register form and
//              SchedRW.Folded gets the load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - ResourceCycles, parallel to ExePorts (default [2]).
//   UOps     - NumMicroOps of the register form (default 2).
//   LoadUOps - micro-ops added on top of UOps for the folded form (default 0).
//
// NOTE(review): unlike the Int/Fpu pair multiclasses there is no !empty(Res)
// guard here, so the folded cycle list is always [2] ++ Res. Callers in this
// file always pass a Res with one entry per ExePorts element; passing [] would
// presumably produce a list whose length does not match the resources.
161multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
162                            list<ProcResourceKind> ExePorts,
163                            int Lat, list<int> Res = [2], int UOps = 2,
164                            int LoadUOps = 0> {
165  // Register variant: Lat cycles of latency and Res cycles on ExePorts
  // (two cycles by default, not one).
166  def : WriteRes<SchedRW, ExePorts> {
167    let Latency = Lat;
168    let ResourceCycles = Res;
169    let NumMicroOps = UOps;
170  }
171
172  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
173  // latency.
174  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
175    let Latency = !add(Lat, 5);
176    let ResourceCycles = !listconcat([2], Res);
177    let NumMicroOps = !add(UOps, LoadUOps);
178  }
179}
180
181// Instructions that have local forwarding disabled have an extra +1cy latency.
182
183// A folded store needs a cycle on the SAGU for the store data. Most RMW
184// instructions don't need an extra uop.  ALU RMW operations don't seem to
185// benefit from STLF, and their observed latency is 6cy. That is the reason why
186// this write adds two extra cycles (instead of just 1cy for the store).
// Arguments below: resources [JSAGU], latency 2, one cycle on JSAGU, and 0
// micro-ops (assuming X86WriteRes takes ports, latency, resource cycles and
// micro-ops in that order; it is declared outside this file -- confirm).
187defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
188
189////////////////////////////////////////////////////////////////////////////////
190// Arithmetic.
191////////////////////////////////////////////////////////////////////////////////
192
// Column legend for the entries in this section:
//   JWriteResIntPair<Write, ExePorts, Latency, ResourceCycles, NumMicroOps>
//     also emits the load-folded form (JLAGU added, +3 cycles of latency).
//   X86WriteRes<Write, ExePorts, Latency, ResourceCycles, NumMicroOps>
//     (argument order assumed; X86WriteRes is declared outside this file).
//   `def : WriteRes<Write, ExePorts>;` sets no fields and so takes the
//     WriteRes defaults.
193defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
194defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
195
196defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
197defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
198defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
199defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
200defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;
201
// Integer multiplies issue on JALU1 and occupy the JMul unit.
202defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
203defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
204defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
205defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
206defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
207defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
208defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
209defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
210defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
211defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
212defm : X86WriteRes<WriteIMulH,          [JALU1], 6, [4], 1>;
213
// Integer divides issue on JALU1 and hold the JDiv unit for as many cycles as
// their latency.
214defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
215defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
216defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
217defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
218defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
219defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
220defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
221defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
222
223defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
224
225defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
226defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
227def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
228def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
229def  : WriteRes<WriteLAHFSAHF, [JALU01]>;
230
// Bit-test writes; the *Ld forms add JLAGU for the memory operand.
231defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
232defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
233defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
234defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
235defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
236defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
237
238// This is for simple LEAs with one or two input operands.
239def : WriteRes<WriteLEA, [JALU01]>;
240
241// Bit counts.
242defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
243defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
244defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
245defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
246defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
247
248// BMI1 BEXTR/BLS, BMI2 BZHI
// BZHI is marked unsupported (both register and folded forms) for this model.
249defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
250defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
251defm : X86WriteResPairUnsupported<WriteBZHI>;
252
253////////////////////////////////////////////////////////////////////////////////
254// Integer shifts and rotates.
255////////////////////////////////////////////////////////////////////////////////
256
// Plain shifts and rotates (immediate and CL-count forms) are all modeled
// alike: latency 1 on either integer ALU, with default cycles and micro-ops.
257defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
258defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
259defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
260defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
261
262// SHLD/SHRD.
// Register forms (rri, rrcl) first, then memory forms (mri, mrcl), which add
// JLAGU for the memory operand and share one set of numbers.
263defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
264defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
265defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
266defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
267
268////////////////////////////////////////////////////////////////////////////////
269// Loads, stores, and moves, not folded with other operations.
270////////////////////////////////////////////////////////////////////////////////
271
// Loads go through JLAGU with the 3-cycle integer load latency; stores go
// through JSAGU and plain moves through either integer ALU, all with the
// WriteRes defaults for the fields not set here.
272def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
273def : WriteRes<WriteStore,   [JSAGU]>;
274def : WriteRes<WriteStoreNT, [JSAGU]>;
275def : WriteRes<WriteMove,    [JALU01]>;
276defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
277
278// Load/store MXCSR.
// Modeled the same way as a plain integer load / store above.
279def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
280def : WriteRes<WriteSTMXCSR, [JSAGU]>;
281
282// Treat misc copies as a move.
283def : InstRW<[WriteMove], (instrs COPY)>;
284
285////////////////////////////////////////////////////////////////////////////////
286// Idioms that clear a register, like xorps %xmm0, %xmm0.
287// These can often bypass execution ports completely.
288////////////////////////////////////////////////////////////////////////////////
289
// The resource list is empty: this write consumes none of the pipes or
// functional units defined in this model.
290def : WriteRes<WriteZero,  []>;
291
292////////////////////////////////////////////////////////////////////////////////
293// Branches don't produce values, so they have no latency, but they still
294// consume resources. Indirect branches can fold loads.
295////////////////////////////////////////////////////////////////////////////////
296
// NOTE(review): the section banner says branches have no latency, but the
// entry below gives latency 1 (and 1 + 3 for the load-folded form generated by
// JWriteResIntPair) -- confirm which is intended.
297defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
298
299////////////////////////////////////////////////////////////////////////////////
300// Special case scheduling classes.
301////////////////////////////////////////////////////////////////////////////////
302
// System and microcoded instructions are given a flat latency of 100 on the
// integer ALU group (presumably a deliberately pessimistic placeholder --
// confirm). Fences use JSAGU with default fields.
303def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
304def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
305def : WriteRes<WriteFence,  [JSAGU]>;
306
307// Nops don't have dependencies, so there's no actual latency, but we set this
308// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
309def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
310
// Dedicated writes for the CMPXCHG family, selected by JWriteCMPXCHGVariant.
// For the memory forms the resource order is [JALU01, JLAGU, JSAGU], so the
// ResourceCycles entries are ALU, load-AGU and store-AGU cycles in that order.
// In every LOCK_ write the JLAGU/JSAGU cycle counts equal the latency
// (presumably to keep both AGUs busy for the whole locked operation --
// confirm); the non-LOCK memory writes use one cycle on each AGU.

// Register-register 8-bit CMPXCHG.
311def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
312  let Latency = 3;
313  let ResourceCycles = [3];
314  let NumMicroOps = 3;
315}
316
// LOCK-prefixed 8-bit CMPXCHG with a memory operand.
317def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
318  let Latency = 16;
319  let ResourceCycles = [3,16,16];
320  let NumMicroOps = 5;
321}
322
// LOCK-prefixed CMPXCHG with a memory operand (sizes other than 8-bit).
323def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
324  let Latency = 17;
325  let ResourceCycles = [3,17,17];
326  let NumMicroOps = 6;
327}
328
// Non-LOCK 8-bit CMPXCHG with a memory operand.
329def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
330  let Latency = 11;
331  let ResourceCycles = [3,1,1];
332  let NumMicroOps = 5;
333}
334
// Non-LOCK CMPXCHG8B.
335def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
336  let Latency = 11;
337  let ResourceCycles = [3,1,1];
338  let NumMicroOps = 18;
339}
340
// Non-LOCK CMPXCHG16B.
341def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
342  let Latency = 32;
343  let ResourceCycles = [6,1,1];
344  let NumMicroOps = 28;
345}
346
// LOCK-prefixed CMPXCHG8B.
347def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
348  let Latency = 19;
349  let ResourceCycles = [3,19,19];
350  let NumMicroOps = 18;
351}
352
// LOCK-prefixed CMPXCHG16B.
353def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
354  let Latency = 38;
355  let ResourceCycles = [6,38,38];
356  let NumMicroOps = 28;
357}
358
// Selects the write for a CMPXCHG-family instruction from MC-level predicates.
// The list runs from the atomic (LOCK) forms, through the non-atomic
// memory forms, to the register-register forms, ending in an unconditional
// fallback to the generic WriteCMPXCHG.
// NOTE(review): the ordering presumably matters (first matching predicate
// wins) -- confirm against the SchedWriteVariant documentation before
// reordering entries.
359def JWriteCMPXCHGVariant :  SchedWriteVariant<[
360  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
361  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
362  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
363  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
364  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
365  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
366  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
367  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
368  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
369  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
370]>;
371
372// The first five reads are contributed by the memory load operand.
373// We ignore those reads and set a read-advance for the other input operands
374// including the implicit read of RAX.
// All three InstRW records below use JWriteCMPXCHGVariant as the write; they
// differ only in the list of reads that follows it.
375def : InstRW<[JWriteCMPXCHGVariant,
376              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
377              ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
378                                                 LCMPXCHG32, LCMPXCHG64,
379                                                 CMPXCHG8rm, CMPXCHG16rm,
380                                                 CMPXCHG32rm, CMPXCHG64rm)>;
381
// Register-register forms: no memory operand, so no reads are listed.
382def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
383                                             CMPXCHG32rr, CMPXCHG64rr)>;
384
// CMPXCHG8B/16B (with and without LOCK): five memory-operand reads followed by
// four ReadAfterLd entries for the implicit register reads.
385def : InstRW<[JWriteCMPXCHGVariant,
386              // Ignore reads contributed by the memory operand.
387              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
388              // Add a read-advance to every implicit register read.
389              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
390                                                                           CMPXCHG8B, CMPXCHG16B)>;
391
// Write used for LOCK-prefixed unary ALU read-modify-write instructions:
// latency 19, one ALU cycle, and 19 cycles on each of JLAGU and JSAGU, as a
// single micro-op.
392def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
393  let Latency = 19;
394  let ResourceCycles = [1,19,19];
395  let NumMicroOps = 1;
396}
397
// Picks JWriteLOCK_ALURMW when the instruction carries a LOCK prefix and the
// generic WriteALURMW otherwise.
398def JWriteLOCK_ALURMWVariant :  SchedWriteVariant<[
399  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
400  SchedVar<NoSchedPred,                       [WriteALURMW]>
401]>;
// Applied to the memory forms of INC, DEC, NOT and NEG at all four widths.
402def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
403                                                 DEC8m, DEC16m, DEC32m, DEC64m,
404                                                 NOT8m, NOT16m, NOT32m, NOT64m,
405                                                 NEG8m, NEG16m, NEG32m, NEG64m)>;
406
// Register-register XCHG8 and XADD (all widths): latency 2, three cycles on
// the integer ALU group, three micro-ops.
407def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
408  let Latency = 2;
409  let ResourceCycles = [3];
410  let NumMicroOps = 3;
411}
412def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
413                                                      XADD32rr, XADD64rr)>;
414
415// This write defines the latency of the in/out register operand of a non-atomic
416// XADDrm. This is the first of a pair of writes that model non-atomic
417// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
418//
419// We need two writes because the instruction latency differs from the output
420// register operand latency. In particular, the first write describes the first
421// (and only) output register operand of the instruction.  However, the
422// instruction latency is set to the MAX of all the write latencies. That's why
423// a second write is needed in this case (see example below).
424//
425// Example:
426//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
427//                            ## ECX write Latency: 3cy
428//
429// Register ECX becomes available in 3 cycles. That is because the value of ECX
430// is exchanged with the value loaded from the memory location addressed by the
431// stack pointer, and the load-to-use latency is assumed to be 3cy.
432def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
433  let Latency = 3;  // load-to-use latency
434  let ResourceCycles = [3];
435  let NumMicroOps = 3;
436}
437
438// This write defines the latency of the in/out register operand of an atomic
439// XADDrm. This is the first of a sequence of two writes used to model atomic
440// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
441//
442//
443// Example:
444//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
445//                               ## ECX write Latency: 11cy
446//
447// The value of ECX becomes available only after 11cy from the start of
448// execution. This write is used to specifically set that operand latency.
// Resource usage (three ALU cycles, three micro-ops) is the same as in the
// non-atomic JWriteXADDrm_XCHG_Part; only the latency differs.
449def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
450  let Latency = 11;
451  let ResourceCycles = [3];
452  let NumMicroOps = 3;
453}
454
455// This write defines the latency of the in/out register operand of an atomic
456// XCHGrm. This write is the first of a sequence of two writes that describe
457// atomic XCHG operations. We need two writes because the instruction latency
458// differs from the output register write latency.  We want to make sure that
459// the output register operand becomes visible after 11cy. However, we want to
460// set the instruction latency to 16cy.
// The 16cy instruction latency comes from the second write of the sequence,
// JWriteXCHGrm_LdSt_Part. This write is applied to XCHG*rm unconditionally,
// i.e. without a LOCK-prefix variant.
461def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
462  let Latency = 11;
463  let ResourceCycles = [2];
464  let NumMicroOps = 2;
465}
466
// Second write of the non-atomic XADDrm pair: supplies the 11cy instruction
// latency and one cycle on each of the load and store AGUs.
467def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
468  let Latency = 11;
469  let ResourceCycles = [1, 1];
470  let NumMicroOps = 1;
471}
472
// Second write for XCHGrm and for LOCK-prefixed XADDrm: supplies the 16cy
// instruction latency and holds both AGUs for 16 cycles.
473def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
474  let Latency = 16;
475  let ResourceCycles = [16, 16];
476  let NumMicroOps = 1;
477}
478
// First write of an XADDrm (register operand latency): the LOCK-prefixed form
// uses the 11cy write, the plain form the 3cy write.
479def JWriteXADDrm_Part1 : SchedWriteVariant<[
480  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
481  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
482]>;
483
// Second write of an XADDrm (load/store part): the LOCK-prefixed form reuses
// the XCHGrm load/store write, the plain form uses the XADDrm one.
484def JWriteXADDrm_Part2 : SchedWriteVariant<[
485  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
486  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
487]>;
488
// Both XADD*rm and LXADD* go through the two variants above, so the LOCK
// prefix check decides which concrete writes are used.
489def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
490                 (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
491                         LXADD8, LXADD16, LXADD32, LXADD64)>;
492
// XCHG*rm always uses the 11cy register write plus the 16cy load/store write.
493def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
494                 (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
495
496
497////////////////////////////////////////////////////////////////////////////////
498// Floating point. This covers both scalar and vector operations.
499////////////////////////////////////////////////////////////////////////////////
500
// Column legend for the entries in this section:
//   JWriteResFpuPair<Write, ExePorts, Latency, ResourceCycles, NumMicroOps>
//     also emits the load-folded form (JLAGU added, +5 cycles of latency).
//   JWriteResYMMPair<...> takes the same columns for the 256-bit writes; its
//     load-folded form charges 2 cycles on JLAGU.
//   X86WriteRes<Write, ExePorts, Latency, ResourceCycles, NumMicroOps>
//     (argument order assumed; X86WriteRes is declared outside this file).
//   X86WriteResPairUnsupported<Write> marks a write (here every *Z and FMA
//     write) as unsupported on this model.
501defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
502defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
503defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
504defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
505defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
506defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>;
507defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
508defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
509
// NOTE(review): WriteFStoreNTY lists 1 micro-op while WriteFStoreY lists 2 with
// the same resource cycles -- confirm this is intentional.
510defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
511defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
512defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
513defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
514defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
515defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
516
// Masked stores use both FPU pipes, both AGUs and the integer ALUs, with high
// micro-op counts.
517defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
518defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
519defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
520defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
521
522defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
523defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
524defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
525
526defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
527
// FP add/compare use pipe JFPU0 with the JFPA unit; FP multiply, reciprocal,
// divide and square root use pipe JFPU1 with the JFPM unit. Divide and square
// root hold JFPM for as many cycles as their latency.
528defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>;
529defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
530defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
531defm : X86WriteResPairUnsupported<WriteFAddZ>;
532defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
533defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
534defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
535defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
536defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
537defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
538defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
539defm : X86WriteResPairUnsupported<WriteFCmpZ>;
540defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
541defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
542defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
543defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
544defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
545defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0],  3>;
546defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
547defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
548defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
549defm : X86WriteResPairUnsupported<WriteFMulZ>;
550defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
551defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
552defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
553defm : X86WriteResPairUnsupported<WriteFMul64Z>;
554defm : X86WriteResPairUnsupported<WriteFMA>;
555defm : X86WriteResPairUnsupported<WriteFMAX>;
556defm : X86WriteResPairUnsupported<WriteFMAY>;
557defm : X86WriteResPairUnsupported<WriteFMAZ>;
558defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
559defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
560defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
561defm : X86WriteResPairUnsupported<WriteDPPSZ>;
562defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
563defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
564defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
565defm : X86WriteResPairUnsupported<WriteFRcpZ>;
566defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
567defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
568defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
569defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
570defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
571defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
572defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
573defm : X86WriteResPairUnsupported<WriteFDivZ>;
574defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
575defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
576defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
577defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
578defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
579defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
580defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
581defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
582defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
583defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
584defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
585defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
586defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
587defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
588defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
589defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
590defm : X86WriteResPairUnsupported<WriteFRndZ>;
591defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
592defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
593defm : X86WriteResPairUnsupported<WriteFLogicZ>;
594defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
595defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
596defm : X86WriteResPairUnsupported<WriteFTestZ>;
597defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
598defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
599defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
600defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
601defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
602defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
603defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
604defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
605defm : X86WriteResPairUnsupported<WriteFBlendZ>;
606defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
607defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
608defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
609defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
610defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
611
612////////////////////////////////////////////////////////////////////////////////
613// Conversions.
614////////////////////////////////////////////////////////////////////////////////
615
// Column legend for the entries in this section:
//   JWriteResFpuPair / JWriteResYMMPair
//     <Write, ExePorts, Latency, ResourceCycles, NumMicroOps>; both also emit
//     the load-folded form (JLAGU added, +5 cycles of latency).
//   X86WriteRes<Write, ExePorts, Latency, ResourceCycles, NumMicroOps>
//     (argument order assumed; X86WriteRes is declared outside this file).
// Conversions run on pipe JFPU1 with the JSTC (store/convert) unit; the scalar
// FP->integer forms additionally use JFPU0/JFPA and JALU0.

// FP -> integer.
616defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
617defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
618defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
619defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
620defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
621defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
622defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
623defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
624
// Integer -> FP. The scalar forms spell out their load variants (*Ld)
// explicitly instead of going through a pair multiclass.
625defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
626defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
627defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
628defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
629defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
630defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
631defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
632defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
633defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
634defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
635
// Single -> double precision.
636defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
637defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
638defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
639defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
640
// Double -> single precision.
641defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
642defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
643defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
644defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
645
// Half -> single precision.
646defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
647defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
648defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
649
// Single -> half precision; the *St forms store the result and add JSAGU and
// one cycle of latency.
650defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
651defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
652defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
653defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
654defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
655defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
656
657////////////////////////////////////////////////////////////////////////////////
658// Vector integer operations.
659////////////////////////////////////////////////////////////////////////////////
660
// Vector loads, stores and register moves.
// Every load entry lists JLAGU and every store entry lists JSAGU (integer
// pipes 2 and 3 in the resource definitions at the top of the file).
// NOTE(review): the positional arguments after the resource list look like
// latency, per-resource cycles, micro-op count -- confirm against the
// X86WriteRes definition.
661defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
662defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
663defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
664defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
665defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
666defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
667defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
668
// Stores go through JSAGU plus JFPU1/JSTC. All four masked-store classes are
// registered as unsupported.
669defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
670defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
671defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
672defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
673defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
674defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
675defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
676defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
677defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
678
// Register moves. The vector<->GPR transfers use different resources from the
// plain vector moves: ToGpr lists JFPU0/JFPA/JALU0, FromGpr lists JFPU01/JFPX.
679defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
680defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
681defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
682defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
683defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
684
// Vector integer arithmetic, shift, multiply, shuffle, blend, logic and test
// write classes.
// In this run every Y- and Z-suffixed class is registered as unsupported, with
// the single exception of WriteVecTestY. The WriteVarVecShift* classes,
// WriteShuffle256, WriteVPMOV256 and WriteVarShuffle256 are unsupported too.
// NOTE(review): where only one number follows the resource list it looks like
// the latency, with the remaining arguments left at the multiclass defaults --
// confirm against the JWriteResFpuPair definition earlier in this file.
685defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
686defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
687defm : X86WriteResPairUnsupported<WriteVecALUY>;
688defm : X86WriteResPairUnsupported<WriteVecALUZ>;
689defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
690defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
691defm : X86WriteResPairUnsupported<WriteVecShiftY>;
692defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
693defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
694defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
695defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
696defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
697defm : X86WriteResPairUnsupported<WriteVarVecShift>;
698defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
699defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// Multiply-type classes are bound to JFPU0/JVIMUL rather than JFPU01/JVALU;
// WritePMULLD lists both pairs.
700defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
701defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
702defm : X86WriteResPairUnsupported<WriteVecIMulY>;
703defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
704defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
705defm : X86WriteResPairUnsupported<WritePMULLDY>;
706defm : X86WriteResPairUnsupported<WritePMULLDZ>;
707defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
708defm : X86WriteResPairUnsupported<WriteMPSADY>;
709defm : X86WriteResPairUnsupported<WriteMPSADZ>;
710defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
711defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
712defm : X86WriteResPairUnsupported<WritePSADBWY>;
713defm : X86WriteResPairUnsupported<WritePSADBWZ>;
714defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
715defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
716defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
717defm : X86WriteResPairUnsupported<WriteShuffleY>;
718defm : X86WriteResPairUnsupported<WriteShuffleZ>;
719defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
720defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
721defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
722defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
723defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
724defm : X86WriteResPairUnsupported<WriteBlendY>;
725defm : X86WriteResPairUnsupported<WriteBlendZ>;
726defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
727defm : X86WriteResPairUnsupported<WriteVarBlendY>;
728defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
729defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
730defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
731defm : X86WriteResPairUnsupported<WriteVecLogicY>;
732defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
// The test classes list JALU0 in addition to FPU-side resources.
733defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
734defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
735defm : X86WriteResPairUnsupported<WriteVecTestZ>;
736defm : X86WriteResPairUnsupported<WriteShuffle256>;
737defm : X86WriteResPairUnsupported<WriteVPMOV256>;
738defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
739
740////////////////////////////////////////////////////////////////////////////////
741// Vector insert/extract operations.
742////////////////////////////////////////////////////////////////////////////////
743
// Insert/extract write classes, each given as an explicit X86WriteRes entry.
// The "Ld" form adds JLAGU and the "St" form adds JSAGU to its resource list.
// WriteVecExtract lists JFPU0/JFPA/JALU0, the same resources as
// WriteVecMoveToGpr in the vector-move group of this file.
744defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
745defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
746defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
747defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
748
749////////////////////////////////////////////////////////////////////////////////
750// SSE42 String instructions.
751////////////////////////////////////////////////////////////////////////////////
752
// String-compare write classes. The two IStr entries share one resource list
// and differ only in the first number (7 vs 8); the two EStr entries are
// identical to each other and additionally list JSAGU and JLAGU.
// NOTE(review): argument order presumably latency, per-resource cycles,
// micro-op count -- confirm against the JWriteResFpuPair definition.
753defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
754defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
755defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
756defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
757
758////////////////////////////////////////////////////////////////////////////////
759// MOVMSK Instructions.
760////////////////////////////////////////////////////////////////////////////////
761
// The FP, vector and MMX MOVMSK classes all use the same resources
// (JFPU0, JFPA, JALU0) with Latency = 3; the Y variant is unsupported.
762def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
763def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
764defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
765def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
766
767////////////////////////////////////////////////////////////////////////////////
768// AES Instructions.
769////////////////////////////////////////////////////////////////////////////////
770
// AES write classes. IMC and KeyGen are bound to JFPU0/JVIMUL only;
// DecEnc lists JFPU01/JVALU as well as JFPU0/JVIMUL.
771defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
772defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
773defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
774
775////////////////////////////////////////////////////////////////////////////////
776// Horizontal add/sub  instructions.
777////////////////////////////////////////////////////////////////////////////////
778
// Horizontal add/sub write classes. The FP forms (WriteFHAdd*) are bound to
// JFPU0/JFPA and the integer forms (WritePHAdd*) to JFPU01/JVALU.
// WritePHAddY is registered as unsupported.
779defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
780defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
781defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
782defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
783defm : X86WriteResPairUnsupported<WritePHAddY>;
784
785////////////////////////////////////////////////////////////////////////////////
786// Carry-less multiplication instructions.
787////////////////////////////////////////////////////////////////////////////////
788
// WriteCLMul uses the same JFPU0/JVIMUL resources as the vector integer
// multiply classes in this file.
789defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
790
791////////////////////////////////////////////////////////////////////////////////
792// SSE4A instructions.
793////////////////////////////////////////////////////////////////////////////////
794
// Scheduling write for the SSE4A INSERTQ / INSERTQI opcodes: Latency 2, with
// ResourceCycles [1, 4] matching the [JFPU01, JVALU] resource list in order.
// The InstRW below binds the two opcodes to it explicitly.
795def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
796  let Latency = 2;
797  let ResourceCycles = [1, 4];
798}
799def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
800
801////////////////////////////////////////////////////////////////////////////////
802// AVX instructions.
803////////////////////////////////////////////////////////////////////////////////
804
// Write for VEXTRACTF128rr on [JFPU01, JFPX]. No fields are overridden here,
// so Latency / ResourceCycles / NumMicroOps keep the SchedWriteRes defaults.
805def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
806def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
807
// Write for the memory-source YMM broadcasts listed in the InstRW below.
// JLAGU heads the resource list; ResourceCycles [1, 2, 4] correspond to
// [JLAGU, JFPU01, JFPX] in order. Latency 6, 2 micro-ops.
808def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
809  let Latency = 6;
810  let ResourceCycles = [1, 2, 4];
811  let NumMicroOps = 2;
812}
813def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
814                                            VBROADCASTSSYrm,
815                                            VBROADCASTF128)>;
816
// Write for VZEROALL: modelled with an empty resource list, Latency 90 and
// 73 micro-ops.
817def JWriteJVZEROALL: SchedWriteRes<[]> {
818  let Latency = 90;
819  let NumMicroOps = 73;
820}
821def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
822
// Write for VZEROUPPER: modelled with an empty resource list, Latency 46 and
// 37 micro-ops.
823def JWriteJVZEROUPPER: SchedWriteRes<[]> {
824  let Latency = 46;
825  let NumMicroOps = 37;
826}
827def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
828
829///////////////////////////////////////////////////////////////////////////////
830//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ
831///////////////////////////////////////////////////////////////////////////////
832
// Write shared by all (V)MASKMOVDQU opcodes listed in the InstRW below.
// ResourceCycles entries correspond position-by-position to the resource
// list, so JSAGU is held for 16 cycles and JALU01 for 42. Latency 34,
// 63 micro-ops.
833def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
834  let Latency = 34;
835  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
836  let NumMicroOps = 63;
837}
838def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32,
839                                         VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>;
840
841///////////////////////////////////////////////////////////////////////////////
842//  SchedWriteVariant definitions.
843///////////////////////////////////////////////////////////////////////////////
844
// Write with an empty resource list and Latency 0. It is the variant selected
// by the zero-idiom SchedWriteVariants below when their predicate matches.
845def JWriteZeroLatency : SchedWriteRes<[]> {
846  let Latency = 0;
847}
848
// Write used for the predicate-matching case of JWriteFZeroIdiomY and
// JWriteVPERM2F128. Unlike JWriteZeroLatency it still lists resources
// ([JFPU01, JFPX]) and is 2 micro-ops; Latency is left at the default.
849def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
850  let NumMicroOps = 2;
851}
852
853// Certain instructions that use the same register for both source
854// operands do not have a real dependency on the previous contents of the
855// register, and thus, do not have to wait before completing. They can be
856// optimized out at register renaming stage.
857// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
858// 15h Processors".
859// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
860// Section 21.8 [Dependency-breaking instructions].
861
// GPR zero-idiom variant for SUB/XOR 32/64-bit reg-reg forms: picks
// JWriteZeroLatency when ZeroIdiomPredicate holds, otherwise the regular
// WriteALU (the NoSchedPred entry).
862def JWriteZeroIdiom : SchedWriteVariant<[
863    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
864    SchedVar<NoSchedPred,                          [WriteALU]>
865]>;
866def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
867                                        XOR32rr, XOR64rr)>;
868
// XMM FP-logic zero-idiom variant (XORPS/XORPD/ANDNPS/ANDNPD and their
// V-prefixed forms): JWriteZeroLatency when ZeroIdiomPredicate holds,
// otherwise WriteFLogic.
869def JWriteFZeroIdiom : SchedWriteVariant<[
870    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
871    SchedVar<NoSchedPred,                          [WriteFLogic]>
872]>;
873def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
874                                         ANDNPSrr, VANDNPSrr,
875                                         ANDNPDrr, VANDNPDrr)>;
876
// YMM FP-logic zero-idiom variant. Unlike the XMM variants, the matching case
// selects JWriteZeroIdiomYmm (which still lists resources and 2 micro-ops)
// rather than JWriteZeroLatency; the fallback is WriteFLogicY.
877def JWriteFZeroIdiomY : SchedWriteVariant<[
878    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
879    SchedVar<NoSchedPred,                          [WriteFLogicY]>
880]>;
881def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
882                                          VANDNPSYrr, VANDNPDYrr)>;
883
// MMX logic zero-idiom variant (MMX_PXORirr, MMX_PANDNirr):
// JWriteZeroLatency when ZeroIdiomPredicate holds, otherwise WriteVecLogic.
884def JWriteVZeroIdiomLogic : SchedWriteVariant<[
885    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
886    SchedVar<NoSchedPred,                          [WriteVecLogic]>
887]>;
888def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
889
// XMM integer-logic zero-idiom variant (PXOR/PANDN and V-prefixed forms):
// JWriteZeroLatency when ZeroIdiomPredicate holds, otherwise WriteVecLogicX.
890def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
891    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
892    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
893]>;
894def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
895                                               PANDNrr, VPANDNrr)>;
896
// MMX ALU zero-idiom variant covering the MMX PSUB*, PSUBS*, PSUBUS* and
// PCMPGT* reg-reg opcodes listed below: JWriteZeroLatency when
// ZeroIdiomPredicate holds, otherwise WriteVecALU.
897def JWriteVZeroIdiomALU : SchedWriteVariant<[
898    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
899    SchedVar<NoSchedPred,                          [WriteVecALU]>
900]>;
901def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
902                                            MMX_PSUBQirr, MMX_PSUBWirr,
903                                            MMX_PSUBSBirr, MMX_PSUBSWirr,
904                                            MMX_PSUBUSBirr, MMX_PSUBUSWirr,
905                                            MMX_PCMPGTBirr, MMX_PCMPGTDirr,
906                                            MMX_PCMPGTWirr)>;
907
// XMM ALU zero-idiom variant. The InstRW lists each SSE opcode next to its
// V-prefixed AVX counterpart: JWriteZeroLatency when ZeroIdiomPredicate
// holds, otherwise WriteVecALUX.
908def JWriteVZeroIdiomALUX : SchedWriteVariant<[
909    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
910    SchedVar<NoSchedPred,                          [WriteVecALUX]>
911]>;
912def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
913                                             PSUBDrr, VPSUBDrr,
914                                             PSUBQrr, VPSUBQrr,
915                                             PSUBWrr, VPSUBWrr,
916                                             PSUBSBrr, VPSUBSBrr,
917                                             PSUBSWrr, VPSUBSWrr,
918                                             PSUBUSBrr, VPSUBUSBrr,
919                                             PSUBUSWrr, VPSUBUSWrr,
920                                             PCMPGTBrr, VPCMPGTBrr,
921                                             PCMPGTDrr, VPCMPGTDrr,
922                                             PCMPGTQrr, VPCMPGTQrr,
923                                             PCMPGTWrr, VPCMPGTWrr)>;
924
// Variant for VPERM2F128rr. It uses its own predicate
// (ZeroIdiomVPERMPredicate) instead of ZeroIdiomPredicate; when it matches,
// JWriteZeroIdiomYmm is used, otherwise WriteFShuffle256.
925def JWriteVPERM2F128 : SchedWriteVariant<[
926  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
927  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
928]>;
929def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
930
// Occupies JALU1 and JSAGU with Latency 2; selected by JWriteLEA when
// JSlowLEAPredicate matches. The resource definitions at the top of the file
// note that SAGU also handles 3-operand LEA.
931// This write is used for slow LEA instructions.
932def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
933  let Latency = 2;
934}
935
// Predicate structure: CheckAny over (a) IsThreeOperandsLEAFn and (b) a
// CheckAll requiring operand 2 to be an immediate whose value is not 1.
// NOTE(review): operand index 2 is presumably the LEA scale operand, as the
// inline comment states -- confirm against the X86 memory-operand layout.
936// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
937// with a `Scale` value different than 1.
938def JSlowLEAPredicate : MCSchedPredicate<
939  CheckAny<[
940    // A 3-operand LEA (base, index, offset).
941    IsThreeOperandsLEAFn,
942    // An LEA with a "Scale" different than 1.
943    CheckAll<[
944      CheckIsImmOperand<2>,
945      CheckNot<CheckImmOperand<2, 1>>
946    ]>
947  ]>
948>;
949
// LEA variant for LEA32r / LEA64r / LEA64_32r: JWrite3OpsLEA when
// JSlowLEAPredicate matches, otherwise the generic WriteLEA.
// LEA16r is not in this list; it is bound separately to JSlowLEA16r.
950def JWriteLEA : SchedWriteVariant<[
951    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
952    SchedVar<NoSchedPred,       [WriteLEA]>
953]>;
954
955def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
956
// Write bound unconditionally (no predicate/variant) to LEA16r:
// JALU01 is held for 4 cycles, Latency 3.
957def JSlowLEA16r : SchedWriteRes<[JALU01]> {
958  let Latency = 3;
959  let ResourceCycles = [4];
960}
961
962def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
963
964///////////////////////////////////////////////////////////////////////////////
965// Dependency breaking instructions.
966///////////////////////////////////////////////////////////////////////////////
967
// Declares the zero-idiom opcodes for this model, grouped by register class.
// The opcode sets match the InstRW lists of the zero-idiom SchedWriteVariants
// defined earlier in this file (GPR, MMX, SSE/AVX xmm, AVX ymm), and
// VPERM2F128rr is paired with its dedicated ZeroIdiomVPERMPredicate.
968def : IsZeroIdiomFunction<[
969  // GPR Zero-idioms.
970  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
971
972  // MMX Zero-idioms.
973  DepBreakingClass<[
974    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
975    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
976    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
977    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
978  ], ZeroIdiomPredicate>,
979
980  // SSE Zero-idioms.
981  DepBreakingClass<[
982    // fp variants.
983    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
984
985    // int variants.
986    PXORrr, PANDNrr,
987    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
988    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
989    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
990  ], ZeroIdiomPredicate>,
991
992  // AVX Zero-idioms.
993  DepBreakingClass<[
994    // xmm fp variants.
995    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
996
997    // xmm int variants.
998    VPXORrr, VPANDNrr,
999    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1000    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
1001    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1002
1003    // ymm variants.
1004    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1005  ], ZeroIdiomPredicate>,
1006
1007  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
1008]>;
1009
// Declares dependency-breaking opcodes that are listed separately from the
// zero-idiom set above: SBB, CMP and the PCMPEQ* families.
// CMP32rr/CMP64rr use CheckSameRegOperand<0, 1> as their predicate; every
// other class here uses ZeroIdiomPredicate.
1010def : IsDepBreakingFunction<[
1011  // GPR
1012  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
1013  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
1014
1015  // MMX
1016  DepBreakingClass<[
1017    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
1018  ], ZeroIdiomPredicate>,
1019
1020  // SSE
1021  DepBreakingClass<[
1022    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1023  ], ZeroIdiomPredicate>,
1024
1025  // AVX
1026  DepBreakingClass<[
1027    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1028  ], ZeroIdiomPredicate>
1029]>;
1030
// Declares the register-to-register move opcodes treated as optimizable
// moves for this model: GPR 32/64-bit, MMX MOVQ, and the SSE/AVX xmm
// aligned/unaligned moves. No ymm (Y-suffixed) opcodes are listed.
// NOTE(review): TruePred presumably makes the class apply unconditionally --
// confirm against the InstructionEquivalenceClass definition.
1031def : IsOptimizableRegisterMove<[
1032  InstructionEquivalenceClass<[
1033    // GPR variants.
1034    MOV32rr, MOV64rr,
1035
1036    // MMX variants.
1037    MMX_MOVQ64rr,
1038
1039    // SSE variants.
1040    MOVAPSrr, MOVUPSrr,
1041    MOVAPDrr, MOVUPDrr,
1042    MOVDQArr, MOVDQUrr,
1043
1044    // AVX variants.
1045    VMOVAPSrr, VMOVUPSrr,
1046    VMOVAPDrr, VMOVUPDrr,
1047    VMOVDQArr, VMOVDQUrr
1048  ], TruePred >
1049]>;
1050
1051} // SchedModel
1052