xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td (revision 6132212808e8dccedc9e5d85fea4390c2f38059a)
1//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD btver2 (Jaguar) to support
10// instruction scheduling and other instruction cost heuristics. Based off AMD Software
11// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
12//
13//===----------------------------------------------------------------------===//
14
// Machine model record for btver2 (Jaguar). It is bound as SchedModel for the
// definitions that follow through the `let SchedModel = BtVer2Model in` scope.
15def BtVer2Model : SchedMachineModel {
16  // All x86 instructions are modeled as a single micro-op, and btver2 can
17  // decode 2 instructions per cycle.
18  let IssueWidth = 2;
19  let MicroOpBufferSize = 64; // Retire Control Unit
20  let LoadLatency = 5; // FPU latency (worst case; cf. integer 3-cycle latency)
21  let HighLatency = 25;
22  let MispredictPenalty = 14; // Minimum branch misdirection penalty
23  let PostRAScheduler = 1;
24
25  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
26  // the scheduler to assign a default model to unrecognized opcodes.
27  let CompleteModel = 0;
28}
29
30let SchedModel = BtVer2Model in {
31
32// Jaguar can issue up to 6 micro-ops in one cycle
// Each of the six issue pipes is declared as its own ProcResource with a unit
// count of 1: four integer-side pipes followed by two vector/FPU pipes.
33def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
34def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
35def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
36def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
37def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
38def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
39
40// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
41// speculative version of the 64-bit integer registers.
42// Reference: www.realworldtech.com/jaguar/4/
43//
44// The processor always keeps the different parts of an integer register
45// together. An instruction that writes to a part of a register will therefore
46// have a false dependence on any previous write to the same register or any
47// part of it.
48// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
49// access" - Agner Fog's "microarchitecture.pdf".
//
// Arguments below: 64 physical entries; register classes GR64 and CCR, each
// costing one entry ([1, 1]); the [1, 0] list flags GR64 (but not CCR) as a
// move-elimination candidate.
// NOTE(review): a max-moves value of 0 presumably means "no per-cycle limit"
// per the RegisterFile definition in TargetSchedule.td - confirm.
50def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
51                               0,  // Max moves that can be eliminated per cycle.
52                               1>; // Restrict move elimination to zero regs.
53
54// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
55// registers. Operations on 256-bit data types are cracked into two COPs.
56// Reference: www.realworldtech.com/jaguar/4/
57
58// The PRF in the floating point unit can eliminate a move from an MMX or SSE
59// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
60// dependency breaking instruction, or via VZEROALL).
61// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
62// instructions" - Agner Fog's "microarchitecture.pdf"
//
// Arguments below: 72 physical entries; VR64 and VR128 cost one entry each and
// VR256 costs two ([1, 1, 2]); the [1, 1, 0] list flags VR64 and VR128 (but not
// VR256) as move-elimination candidates.
63def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
64                          0,  // Max moves that can be eliminated per cycle.
65                          1>; // Restrict move elimination to zero regs.
66
67// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
68// retire up to two macro-ops per cycle.
69// Reference: "Software Optimization Guide for AMD Family 16h Processors"
// The two template arguments carry those numbers: 64 (in-flight capacity) and
// 2 (retire width).
70def JRCU : RetireControlUnit<64, 2>;
71
// The three schedulers are modeled as resource groups over the pipes they feed.
// NOTE(review): BufferSize is presumably the number of scheduler queue entries
// for each group - confirm against the AMD Family 16h optimization guide.

72// Integer Pipe Scheduler
// Groups the two integer ALU pipes.
73def JALU01 : ProcResGroup<[JALU0, JALU1]> {
74  let BufferSize=20;
75}
76
77// AGU Pipe Scheduler
// Groups the load AGU and the store AGU pipes.
78def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
79  let BufferSize=12;
80}
81
82// Fpu Pipe Scheduler
// Groups the two vector/FPU pipes.
83def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
84  let BufferSize=18;
85}
86
87// Functional units
// Each unit is a separate single-unit resource. Writes in this file list a unit
// next to the issue pipe that hosts it (e.g. [JALU1, JMul] or [JFPU0, JFPA]).
88def JDiv    : ProcResource<1>; // integer division
89def JMul    : ProcResource<1>; // integer multiplication
90def JVALU0  : ProcResource<1>; // vector integer
91def JVALU1  : ProcResource<1>; // vector integer
92def JVIMUL  : ProcResource<1>; // vector integer multiplication
93def JSTC    : ProcResource<1>; // vector store/convert
94def JFPM    : ProcResource<1>; // FP multiplication
95def JFPA    : ProcResource<1>; // FP addition
96
97// Functional unit groups
98def JFPX  : ProcResGroup<[JFPA, JFPM]>;     // either FP unit (add or multiply)
99def JVALU : ProcResGroup<[JVALU0, JVALU1]>; // either vector integer unit
100
101// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
102// cycles after the memory operand.
103def : ReadAdvance<ReadAfterLd, 3>;
104
105// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
106// cycles after the memory operand.
107def : ReadAdvance<ReadAfterVecLd, 5>;
108def : ReadAdvance<ReadAfterVecXLd, 5>;
109def : ReadAdvance<ReadAfterVecYLd, 5>;
110
111/// "Additional 6 cycle transfer operation which moves a floating point
112/// operation input value from the integer unit to the floating point unit."
113/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
/// The advance is negative here, unlike the load cases above; presumably this
/// charges the 6-cycle transfer as extra latency on the consuming read.
114def : ReadAdvance<ReadInt2Fpu, -6>;
115
116// Many SchedWrites are defined in pairs with and without a folded load.
117// Instructions with folded loads are usually micro-fused, so they only appear
118// as two micro-ops when dispatched by the schedulers.
119// This multiclass defines the resource usage for variants with and without
120// folded loads.
//
// Parameters:
//   SchedRW  - the foldable write; SchedRW.Folded names its load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - cycles for each entry of ExePorts; empty keeps WriteRes defaults.
//   UOps     - micro-op count of the register form.
//   LoadUOps - extra micro-ops added for the load-folded form.
// NOTE(review): with the default LoadUOps = 0 the load-folded form reports the
// same NumMicroOps as the register form.
121multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
122                            list<ProcResourceKind> ExePorts,
123                            int Lat, list<int> Res = [], int UOps = 1,
124                            int LoadUOps = 0> {
125  // Register variant: by default uses a single cycle on each ExePort; a
  // non-empty Res overrides the per-port cycle counts.
126  def : WriteRes<SchedRW, ExePorts> {
127    let Latency = Lat;
128    let ResourceCycles = Res;
129    let NumMicroOps = UOps;
130  }
131
132  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
133  // latency.
  // JLAGU is prepended to the resource list. When Res is non-empty a matching
  // 1 is prepended to the cycle list; otherwise the list is left empty.
134  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
135    let Latency = !add(Lat, 3);
136    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
137    let NumMicroOps = !add(UOps, LoadUOps);
138  }
139}
140
// Defines a register write and its load-folded form for FPU/vector operations.
// Unlike JWriteResIntPair, the folded form adds 5 cycles of latency, not 3.
//
// Parameters:
//   SchedRW  - the foldable write; SchedRW.Folded names its load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - cycles for each entry of ExePorts; empty keeps WriteRes defaults.
//   UOps     - micro-op count of the register form.
//   LoadUOps - extra micro-ops added for the load-folded form.
141multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
142                            list<ProcResourceKind> ExePorts,
143                            int Lat, list<int> Res = [], int UOps = 1,
144                            int LoadUOps = 0> {
145  // Register variant: by default uses a single cycle on each ExePort; a
  // non-empty Res overrides the per-port cycle counts.
146  def : WriteRes<SchedRW, ExePorts> {
147    let Latency = Lat;
148    let ResourceCycles = Res;
149    let NumMicroOps = UOps;
150  }
151
152  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
153  // latency.
  // JLAGU is prepended to the resource list. When Res is non-empty a matching
  // 1 is prepended to the cycle list; otherwise the list is left empty.
154  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
155    let Latency = !add(Lat, 5);
156    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
157    let NumMicroOps = !add(UOps, LoadUOps);
158  }
159}
160
// Defines a register write and its load-folded form for 256-bit (YMM)
// operations. The defaults differ from JWriteResFpuPair: Res = [2] and
// UOps = 2.
//
// Parameters:
//   SchedRW  - the foldable write; SchedRW.Folded names its load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - cycles for each entry of ExePorts.
//   UOps     - micro-op count of the register form.
//   LoadUOps - extra micro-ops added for the load-folded form.
161multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
162                            list<ProcResourceKind> ExePorts,
163                            int Lat, list<int> Res = [2], int UOps = 2,
164                            int LoadUOps = 0> {
165  // Register variant uses Res cycles on ExePorts (2 cycles by default).
166  def : WriteRes<SchedRW, ExePorts> {
167    let Latency = Lat;
168    let ResourceCycles = Res;
169    let NumMicroOps = UOps;
170  }
171
172  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
173  // latency.
  // The 2 for JLAGU is always prepended here; there is no empty-Res special
  // case as in the other pair multiclasses.
174  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
175    let Latency = !add(Lat, 5);
176    let ResourceCycles = !listconcat([2], Res);
177    let NumMicroOps = !add(UOps, LoadUOps);
178  }
179}
180
181// Instructions that have local forwarding disabled have an extra +1cy latency.
182
183// A folded store needs a cycle on the SAGU for the store data; most RMW
184// instructions don't need an extra uop.  ALU RMW operations don't seem to
185// benefit from STLF, and their observed latency is 6cy. That is the reason why
186// this write adds two extra cycles (instead of just 1cy for the store).
// The values below are: latency 2, one cycle on JSAGU, and 0 micro-ops.
// NOTE(review): this assumes X86WriteRes takes (write, ports, latency, resource
// cycles, micro-ops) in that order; it is defined outside this file - confirm.
187defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
188
189////////////////////////////////////////////////////////////////////////////////
190// Arithmetic.
191////////////////////////////////////////////////////////////////////////////////
192
// Basic ALU writes, all issued to the JALU01 group. JWriteResIntPair takes
// (write, ports, latency, [resource cycles], [micro-ops]) and also defines the
// load-folded form of the write.
193defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
194defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
195
// Byte swaps and exchanges. The CMPXCHG RMW form also uses both AGUs.
196defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
197defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
198defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
199defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
200defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;
201
// Integer multiplies issue on JALU1 and occupy the JMul unit. In each cycle
// list the first entry is the JALU1 occupancy and the second is the JMul
// occupancy; the trailing argument is the micro-op count.
202defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
203defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
204defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
205defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
206defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
207defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
208defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
209defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
210defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
211defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
// WriteIMulH lists only JALU1 (no JMul entry), held for 4 cycles.
212defm : X86WriteRes<WriteIMulH,          [JALU1], 6, [4], 1>;
213
// Integer divides issue on JALU1 and occupy the JDiv unit. In every entry the
// JDiv cycle count equals the latency, so the divider is held for the whole
// operation. Signed (IDiv) and unsigned (Div) forms use identical numbers.
214defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
215defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
216defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
217defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
218defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
219defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
220defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
221defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
222
// CRC32: latency 3, 4 cycles on JALU01, 3 micro-ops.
223defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
224
// Conditional moves and flag-related writes. The bare WriteRes definitions set
// no fields, so they keep the WriteRes defaults on the listed resources.
225defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
226defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
227def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
228def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
229def  : WriteRes<WriteLAHFSAHF, [JALU01]>;
230
// Bit-test writes. The register forms have latency 1 on JALU01; the load forms
// add one JLAGU cycle and have latency 4. The forms differ mainly in the
// micro-op count given as the last argument.
231defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
232defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
233defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
234defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
235defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
236defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
237
238// This is for simple LEAs with one or two input operands.
// It keeps the WriteRes defaults on the JALU01 group.
239def : WriteRes<WriteLEA, [JALU01]>;
240
241// Bit counts.
// BSF and BSR hold JALU01 for 8 cycles and are 7 and 8 micro-ops respectively.
// POPCNT and LZCNT are single-cycle; TZCNT takes 2 cycles and 2 micro-ops.
242defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
243defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
244defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
245defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
246defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
247
248// BMI1 BEXTR/BLS, BMI2 BZHI
// BZHI is marked unsupported in this model.
249defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
250defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
251defm : X86WriteResPairUnsupported<WriteBZHI>;
252
253////////////////////////////////////////////////////////////////////////////////
254// Integer shifts and rotates.
255////////////////////////////////////////////////////////////////////////////////
256
// Plain shifts and rotates, by immediate or by CL: one cycle on JALU01.
257defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
258defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
259defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
260defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
261
262// SHLD/SHRD.
// Double-precision shifts are multi-micro-op. The memory forms (mri/mrcl) add
// one JLAGU cycle and hold JALU01 for 22 cycles.
263defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
264defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
265defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
266defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
267
268////////////////////////////////////////////////////////////////////////////////
269// Loads, stores, and moves, not folded with other operations.
270////////////////////////////////////////////////////////////////////////////////
271
// Loads go to the load AGU with a 3-cycle latency; stores go to the store AGU;
// register moves go to either integer ALU. Definitions without a body keep the
// WriteRes defaults.
272def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
273def : WriteRes<WriteStore,   [JSAGU]>;
274def : WriteRes<WriteStoreNT, [JSAGU]>;
275def : WriteRes<WriteMove,    [JALU01]>;
276
277// Load/store MXCSR.
// Modeled the same way as a plain integer load and store.
278def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
279def : WriteRes<WriteSTMXCSR, [JSAGU]>;
280
281// Treat misc copies as a move.
282def : InstRW<[WriteMove], (instrs COPY)>;
283
284////////////////////////////////////////////////////////////////////////////////
285// Idioms that clear a register, like xorps %xmm0, %xmm0.
286// These can often bypass execution ports completely.
287////////////////////////////////////////////////////////////////////////////////
288
// The resource list is empty, so this write consumes no processor resource.
289def : WriteRes<WriteZero,  []>;
290
291////////////////////////////////////////////////////////////////////////////////
292// Branches don't produce values, so they have no latency, but they still
293// consume resources. Indirect branches can fold loads.
294////////////////////////////////////////////////////////////////////////////////
295
// Jumps use one cycle on JALU01. The pair multiclass also defines the
// load-folded form.
296defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
297
298////////////////////////////////////////////////////////////////////////////////
299// Special case scheduling classes.
300////////////////////////////////////////////////////////////////////////////////
301
// System and microcoded instructions get a flat latency of 100 on JALU01.
// Fences keep the WriteRes defaults on the store AGU.
302def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
303def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
304def : WriteRes<WriteFence,  [JSAGU]>;
305
306// Nops don't have dependencies, so there's no actual latency, but we set this
307// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
308def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
309
// SchedWriteRes records for the CMPXCHG family; JWriteCMPXCHGVariant selects
// among them. For the memory forms the ResourceCycles entries follow the
// resource list order [JALU01, JLAGU, JSAGU]. In every LOCK_ record the JLAGU
// and JSAGU cycle counts equal the latency, so both AGUs are held for the
// whole operation; the non-locked memory forms use one cycle on each AGU.

// Register-register 8-bit CMPXCHG: ALU only.
310def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
311  let Latency = 3;
312  let ResourceCycles = [3];
313  let NumMicroOps = 3;
314}
315
// LOCK-prefixed 8-bit CMPXCHG with a memory operand.
316def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
317  let Latency = 16;
318  let ResourceCycles = [3,16,16];
319  let NumMicroOps = 5;
320}
321
// LOCK-prefixed CMPXCHG with a memory operand (the non-8-bit forms).
322def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
323  let Latency = 17;
324  let ResourceCycles = [3,17,17];
325  let NumMicroOps = 6;
326}
327
// Non-locked 8-bit CMPXCHG with a memory operand.
328def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
329  let Latency = 11;
330  let ResourceCycles = [3,1,1];
331  let NumMicroOps = 5;
332}
333
// Non-locked CMPXCHG8B.
334def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
335  let Latency = 11;
336  let ResourceCycles = [3,1,1];
337  let NumMicroOps = 18;
338}
339
// Non-locked CMPXCHG16B.
340def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
341  let Latency = 32;
342  let ResourceCycles = [6,1,1];
343  let NumMicroOps = 28;
344}
345
// LOCK-prefixed CMPXCHG8B.
346def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
347  let Latency = 19;
348  let ResourceCycles = [3,19,19];
349  let NumMicroOps = 18;
350}
351
// LOCK-prefixed CMPXCHG16B.
352def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
353  let Latency = 38;
354  let ResourceCycles = [6,38,38];
355  let NumMicroOps = 28;
356}
357
// Picks the write for a CMPXCHG-family instruction from MC-level predicates.
// Atomic (LOCK) forms are listed first, then the non-atomic memory forms, then
// the 8-bit register-register form; the NoSchedPred entry is the fallback.
// NOTE(review): the predicates are defined outside this file; the entries are
// presumably tried in listed order with the first match winning - confirm.
358def JWriteCMPXCHGVariant :  SchedWriteVariant<[
359  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
360  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
361  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
362  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
363  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
364  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
365  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
366  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
367  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
368  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
369]>;
370
371// The first five reads are contributed by the memory load operand.
372// We ignore those reads and set a read-advance for the other input operands
373// including the implicit read of RAX.
// This binding covers the memory forms, with and without LOCK.
374def : InstRW<[JWriteCMPXCHGVariant,
375              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
376              ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
377                                                 LCMPXCHG32, LCMPXCHG64,
378                                                 CMPXCHG8rm, CMPXCHG16rm,
379                                                 CMPXCHG32rm, CMPXCHG64rm)>;
380
// Register-register forms: only the write is listed, with no read overrides.
381def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
382                                             CMPXCHG32rr, CMPXCHG64rr)>;
383
// CMPXCHG8B/CMPXCHG16B, with and without LOCK: five memory-operand reads are
// followed by four implicit register reads.
384def : InstRW<[JWriteCMPXCHGVariant,
385              // Ignore reads contributed by the memory operand.
386              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
387              // Add a read-advance to every implicit register read.
388              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
389                                                                           CMPXCHG8B, CMPXCHG16B)>;
390
// LOCK-prefixed ALU read-modify-write: 19-cycle latency, one ALU cycle, and
// both AGUs held for all 19 cycles.
391def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
392  let Latency = 19;
393  let ResourceCycles = [1,19,19];
394  let NumMicroOps = 1;
395}
396
// Uses the locked write when the instruction carries a LOCK prefix; otherwise
// falls back to the generic WriteALURMW.
397def JWriteLOCK_ALURMWVariant :  SchedWriteVariant<[
398  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
399  SchedVar<NoSchedPred,                       [WriteALURMW]>
400]>;
// Applied to the memory-destination forms of INC, DEC, NOT and NEG.
401def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
402                                                 DEC8m, DEC16m, DEC32m, DEC64m,
403                                                 NOT8m, NOT16m, NOT32m, NOT64m,
404                                                 NEG8m, NEG16m, NEG32m, NEG64m)>;
405
// Register-register XCHG8 and XADD (all widths): 2-cycle latency, 3 micro-ops,
// and 3 cycles on the JALU01 group.
406def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
407  let Latency = 2;
408  let ResourceCycles = [3];
409  let NumMicroOps = 3;
410}
411def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
412                                                      XADD32rr, XADD64rr)>;
413
414// This write defines the latency of the in/out register operand of a non-atomic
415// XADDrm. This is the first of a pair of writes that model non-atomic
416// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
417//
418// We need two writes because the instruction latency differs from the output
419// register operand latency. In particular, the first write describes the first
420// (and only) output register operand of the instruction.  However, the
421// instruction latency is set to the MAX of all the write latencies. That's why
422// a second write is needed in this case (see example below).
423//
424// Example:
425//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
426//                            ## ECX write Latency: 3cy
427//
428// Register ECX becomes available in 3 cycles. That is because the value of ECX
429// is exchanged with the value loaded from memory at (%rsp), and the load-to-use
430// latency is assumed to be 3cy.
431def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
432  let Latency = 3;  // load-to-use latency
433  let ResourceCycles = [3];
434  let NumMicroOps = 3;
435}
436
437// This write defines the latency of the in/out register operand of an atomic
438// XADDrm. This is the first of a sequence of two writes used to model atomic
439// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
440//
441//
442// Example:
443//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
444//                               ## ECX write Latency: 11cy
445//
446// The value of ECX becomes available only after 11cy from the start of
447// execution. This write is used to specifically set that operand latency.
// The ALU usage (3 cycles, 3 micro-ops) matches the non-atomic
// JWriteXADDrm_XCHG_Part; only the latency differs.
448def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
449  let Latency = 11;
450  let ResourceCycles = [3];
451  let NumMicroOps = 3;
452}
453
454// This write defines the latency of the in/out register operand of an atomic
455// XCHGrm. This write is the first of a sequence of two writes that describe
456// atomic XCHG operations. We need two writes because the instruction latency
457// differs from the output register write latency.  We want to make sure that
458// the output register operand becomes visible after 11cy. However, we want to
459// set the instruction latency to 16cy.
// The 16cy instruction latency comes from JWriteXCHGrm_LdSt_Part, the second
// write paired with this one in the XCHGrm InstRW.
460def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
461  let Latency = 11;
462  let ResourceCycles = [2];
463  let NumMicroOps = 2;
464}
465
// Load/store half of a non-atomic XADDrm. It sets the 11cy instruction latency
// and uses one cycle on each AGU.
466def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
467  let Latency = 11;
468  let ResourceCycles = [1, 1];
469  let NumMicroOps = 1;
470}
471
// Load/store half of XCHGrm, also used for LOCK XADD through
// JWriteXADDrm_Part2. It sets the 16cy instruction latency and holds both
// AGUs for all 16 cycles.
472def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
473  let Latency = 16;
474  let ResourceCycles = [16, 16];
475  let NumMicroOps = 1;
476}
477
// First write of XADDrm (the register operand): the LOCK-prefixed form uses
// the 11cy write, otherwise the 3cy write.
478def JWriteXADDrm_Part1 : SchedWriteVariant<[
479  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
480  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
481]>;
482
// Second write of XADDrm (the load/store part): the LOCK-prefixed form reuses
// the 16cy XCHGrm load/store write, otherwise the 11cy write.
483def JWriteXADDrm_Part2 : SchedWriteVariant<[
484  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
485  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
486]>;
487
// Both the plain and the LOCK-named XADD memory opcodes share the variants.
488def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
489                 (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
490                         LXADD8, LXADD16, LXADD32, LXADD64)>;
491
// XCHG with a memory operand always uses the 11cy/16cy pair; no variant is
// involved.
492def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
493                 (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
494
495
496////////////////////////////////////////////////////////////////////////////////
497// Floating point. This covers both scalar and vector operations.
498////////////////////////////////////////////////////////////////////////////////
499
// FLD0/FLD1/FLDC writes: issued on JFPU1 using the JSTC unit, latency 3.
500defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
501defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
502defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
// FP loads have latency 5. The X and Y forms use only the load AGU, and the Y
// form takes 2 AGU cycles and 2 micro-ops. Masked loads have latency 6 and
// also occupy an FP pipe and FP unit.
503defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
504defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
505defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>;
506defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
507defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
508
// FP stores use the store AGU plus JFPU1 and the JSTC unit. The Y forms double
// the resource cycles. The non-temporal (NT) forms have latency 3.
509defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
510defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
511defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
512defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
513defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
514defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
515
// Masked FP stores are heavily multi-micro-op (10 to 36) and touch both FP
// pipes, both AGUs and the integer ALU group. The cycle lists follow the
// resource order [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01].
516defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
517defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
518defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
519defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
520
// FP register moves: one cycle on either FP pipe and either FP unit. The Y
// form doubles the cycles and micro-ops.
521defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
522defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
523defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
524
// EMMS uses the same resources as an FP move but has latency 2.
525defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
526
// FP add and compare issue on JFPU0 and use the JFPA adder. Adds have latency
// 3 and compares latency 2. The Y forms pass [2,2] cycles and 2 micro-ops.
// The *Z writes are marked unsupported. The pair multiclasses also define the
// load-folded forms.
527defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>;
528defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
529defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
530defm : X86WriteResPairUnsupported<WriteFAddZ>;
531defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
532defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
533defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
534defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
535defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
536defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
537defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
538defm : X86WriteResPairUnsupported<WriteFCmpZ>;
539defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
540defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
541defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
542defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
// WriteFCom/WriteFComX additionally list the JALU0 integer pipe.
543defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
544defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0],  3>;
// FP multiplies issue on JFPU1 and use the JFPM multiplier. The WriteFMul
// group has latency 2. The WriteFMul64 group has latency 4 and holds JFPM for
// 2 cycles (4 for the Y form).
545defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
546defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
547defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
548defm : X86WriteResPairUnsupported<WriteFMulZ>;
549defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
550defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
551defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
552defm : X86WriteResPairUnsupported<WriteFMul64Z>;
// All FMA writes are marked unsupported in this model.
553defm : X86WriteResPairUnsupported<WriteFMA>;
554defm : X86WriteResPairUnsupported<WriteFMAX>;
555defm : X86WriteResPairUnsupported<WriteFMAY>;
556defm : X86WriteResPairUnsupported<WriteFMAZ>;
// Dot products occupy both the multiplier and the adder and are 3, 5 and 10
// micro-ops for DPPD, DPPS and DPPSY respectively.
557defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
558defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
559defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
560defm : X86WriteResPairUnsupported<WriteDPPSZ>;
// Reciprocal and reciprocal-sqrt estimates: latency 2 on JFPU1/JFPM.
561defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
562defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
563defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
564defm : X86WriteResPairUnsupported<WriteFRcpZ>;
565defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
566defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
567defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
568defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
// Divides and square roots hold JFPM for their full latency: the JFPM cycle
// count equals the latency in every entry. The Y forms have exactly twice the
// latency of the corresponding scalar/X forms.
569defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
570defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
571defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
572defm : X86WriteResPairUnsupported<WriteFDivZ>;
573defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
574defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
575defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
576defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
577defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
578defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
579defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
580defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
581defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
582defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
583defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
584defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
585defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
// Sign operations use the multiplier (JFPM); rounding uses the JSTC unit.
586defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
587defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
588defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
589defm : X86WriteResPairUnsupported<WriteFRndZ>;
// Logic ops, shuffles and blends can go to either FP pipe and either FP unit.
// The simple forms are single-cycle; the Y forms double cycles and micro-ops.
590defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
591defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
592defm : X86WriteResPairUnsupported<WriteFLogicZ>;
// FP test writes also list the JALU0 integer pipe.
593defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
594defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
595defm : X86WriteResPairUnsupported<WriteFTestZ>;
596defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
597defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
598defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
// Variable shuffles and blends are multi-micro-op: 3 for the 128-bit forms and
// 6 for the Y forms.
599defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
600defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
601defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
602defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
603defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
604defm : X86WriteResPairUnsupported<WriteFBlendZ>;
605defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
606defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
607defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// WriteFShuffle256 uses the FPU pair multiclass (not the YMM one) with
// explicit [2, 2] cycles and 2 micro-ops. WriteFVarShuffle256 is unsupported.
608defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
609defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
610
611////////////////////////////////////////////////////////////////////////////////
612// Conversions.
613////////////////////////////////////////////////////////////////////////////////
614
// FP -> integer conversions. The scalar forms (SS2I/SD2I) have latency 7 and
// 2 micro-ops, and list JFPU0/JFPA and JALU0 in addition to JFPU1/JSTC. The
// packed forms use only JFPU1/JSTC; the PD2IY form also lists JFPX.
615defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
616defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
617defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
618defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
619defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
620defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
621defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
622defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
623
// Integer -> FP conversions. The scalar register and load forms are defined
// separately with X86WriteRes rather than through a pair multiclass: latency 4
// and 2 micro-ops for the register form, latency 9 and 1 micro-op for the load
// form.
624defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
625defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
626defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
627defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
628defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
629defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
630defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
631defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
632defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
633defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
634
// FP <-> FP precision conversions (single -> double first, then
// double -> single). Both scalar forms use 7 as the first integer argument
// and [1,2] for JFPU1/JSTC; the YMM double -> single form is the only entry
// here that also lists JFPX. The "Z" forms are registered as unsupported.
635defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
636defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
637defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
638defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
639
640defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
641defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
642defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
643defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
644
// Half-precision conversions.
// PH2PS uses the pair multiclasses. PS2PH is described with explicit
// X86WriteRes entries instead: a plain write plus a separate "St" write that
// additionally lists JSAGU. The "Z" forms are registered as unsupported.
645defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
646defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
647defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
648
649defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
650defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
651defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
652defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
653defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
654defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
655
656////////////////////////////////////////////////////////////////////////////////
657// Vector integer operations.
658////////////////////////////////////////////////////////////////////////////////
659
// Vector loads. Every entry lists JLAGU; the X and Y forms list only JLAGU,
// while the generic, non-temporal and masked forms also list JFPU01 and
// JVALU. WriteVecLoadNTY is given exactly the same arguments as
// WriteVecLoadNT.
660defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
661defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
662defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
663defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
664defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
665defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
666defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
667
// Vector stores. All supported entries list the same three resources
// (JSAGU, JFPU1, JSTC). The four masked vector store writes are registered
// as unsupported.
668defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
669defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
670defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
671defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
672defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
673defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
674defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
675defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
676defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
677
// Vector register moves. The vector-to-vector moves list JFPU01/JVALU.
// The two GPR crossings use different resources: WriteVecMoveToGpr lists
// JFPU0, JFPA and JALU0, whereas WriteVecMoveFromGpr lists JFPU01 and JFPX.
678defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
679defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
680defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
681defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
682defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
683
// Vector integer arithmetic, shift, multiply, SAD, shuffle, blend, logic and
// test writes.
// In this group every "Y"-suffixed write except WriteVecTestY, and every
// "Z"-suffixed write, is registered as unsupported, as are the variable
// vector shifts and the 256-bit integer shuffles.
// NOTE(review): presumably because this CPU has no 256-bit/512-bit integer
// vector support -- confirm against the feature list for btver2.
// Entries with only one integer argument rely on the multiclass defaults for
// the remaining parameters (not visible in this part of the file).
684defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
685defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
686defm : X86WriteResPairUnsupported<WriteVecALUY>;
687defm : X86WriteResPairUnsupported<WriteVecALUZ>;
688defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
689defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
690defm : X86WriteResPairUnsupported<WriteVecShiftY>;
691defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
692defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
693defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
694defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
695defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
696defm : X86WriteResPairUnsupported<WriteVarVecShift>;
697defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
698defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
699defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
700defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
701defm : X86WriteResPairUnsupported<WriteVecIMulY>;
702defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
703defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
704defm : X86WriteResPairUnsupported<WritePMULLDY>;
705defm : X86WriteResPairUnsupported<WritePMULLDZ>;
706defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
707defm : X86WriteResPairUnsupported<WriteMPSADY>;
708defm : X86WriteResPairUnsupported<WriteMPSADZ>;
709defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
710defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
711defm : X86WriteResPairUnsupported<WritePSADBWY>;
712defm : X86WriteResPairUnsupported<WritePSADBWZ>;
713defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
714defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
715defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
716defm : X86WriteResPairUnsupported<WriteShuffleY>;
717defm : X86WriteResPairUnsupported<WriteShuffleZ>;
718defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
719defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
720defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
721defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
722defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
723defm : X86WriteResPairUnsupported<WriteBlendY>;
724defm : X86WriteResPairUnsupported<WriteBlendZ>;
725defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
726defm : X86WriteResPairUnsupported<WriteVarBlendY>;
727defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
728defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
729defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
730defm : X86WriteResPairUnsupported<WriteVecLogicY>;
731defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
732defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
733defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
734defm : X86WriteResPairUnsupported<WriteVecTestZ>;
735defm : X86WriteResPairUnsupported<WriteShuffle256>;
736defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
737
738////////////////////////////////////////////////////////////////////////////////
739// Vector insert/extract operations.
740////////////////////////////////////////////////////////////////////////////////
741
// Insert/extract writes, each described explicitly together with its memory
// counterpart: the insert-from-load form adds JLAGU, and the extract-to-store
// form lists JFPU1/JSTC/JSAGU instead of the JFPU0/JFPA/JALU0 resources
// listed for the register extract.
742defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
743defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
744defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
745defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
746
747////////////////////////////////////////////////////////////////////////////////
748// SSE4.2 string instructions.
749////////////////////////////////////////////////////////////////////////////////
750
// String compare writes. The two PCmpIStr* entries share one resource list
// and differ only in the first integer argument (7 vs 8). The two PCmpEStr*
// entries are identical to each other and use a longer resource list that
// also includes JSAGU and JLAGU.
751defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
752defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
753defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
754defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
755
756////////////////////////////////////////////////////////////////////////////////
757// MOVMSK Instructions.
758////////////////////////////////////////////////////////////////////////////////
759
// MOVMSK writes. The FP, vector and MMX forms all use the same resources
// (JFPU0, JFPA, JALU0) with Latency = 3; the YMM vector form is registered as
// unsupported. These use plain WriteRes records, so only Latency is
// overridden here.
760def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
761def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
762defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
763def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
764
765////////////////////////////////////////////////////////////////////////////////
766// AES Instructions.
767////////////////////////////////////////////////////////////////////////////////
768
// AES writes. AESIMC and AESKeyGen list JFPU0/JVIMUL only; AESDecEnc lists
// JFPU01/JVALU as well as JFPU0/JVIMUL.
769defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
770defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
771defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
772
773////////////////////////////////////////////////////////////////////////////////
774// Horizontal add/sub instructions.
775////////////////////////////////////////////////////////////////////////////////
776
// Horizontal add/sub writes. The FP forms list JFPU0/JFPA and the integer
// forms list JFPU01/JVALU; the integer "Y" form is registered as unsupported.
// The "+1cy latency" remarks are from the original author; the baseline they
// are relative to is not stated in this part of the file.
777defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
778defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
779defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
780defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
781defm : X86WriteResPairUnsupported<WritePHAddY>;
782
783////////////////////////////////////////////////////////////////////////////////
784// Carry-less multiplication instructions.
785////////////////////////////////////////////////////////////////////////////////
786
// Carry-less multiply lists the same resources (JFPU0, JVIMUL) and the same
// integer argument (2) as the WriteVecIMul entries in this file.
787defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
788
789////////////////////////////////////////////////////////////////////////////////
790// SSE4A instructions.
791////////////////////////////////////////////////////////////////////////////////
792
// Dedicated write for INSERTQ/INSERTQI: latency 2, with 1 cycle on JFPU01
// and 4 cycles on JVALU. NumMicroOps is not overridden here.
793def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
794  let Latency = 2;
795  let ResourceCycles = [1, 4];
796}
// Bind the write to both INSERTQ opcodes.
797def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
798
799////////////////////////////////////////////////////////////////////////////////
800// AVX instructions.
801////////////////////////////////////////////////////////////////////////////////
802
// Write for the register form of VEXTRACTF128. It names the JFPU01/JFPX
// resources and overrides no fields, so latency, resource cycles and
// micro-op count are whatever SchedWriteRes provides by default.
803def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
804def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
805
// Write for the YMM broadcast-from-memory instructions: latency 6, two
// micro-ops, with 1 cycle on JLAGU, 2 on JFPU01 and 4 on JFPX.
806def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
807  let Latency = 6;
808  let ResourceCycles = [1, 2, 4];
809  let NumMicroOps = 2;
810}
// Applied to the SD/SS YMM load broadcasts and to VBROADCASTF128.
811def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
812                                            VBROADCASTSSYrm,
813                                            VBROADCASTF128)>;
814
// Write for VZEROALL. The resource list is empty, so only the latency (90)
// and the micro-op count (73) are modelled for this instruction.
815def JWriteJVZEROALL: SchedWriteRes<[]> {
816  let Latency = 90;
817  let NumMicroOps = 73;
818}
819def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
820
// Write for VZEROUPPER. As with VZEROALL the resource list is empty; only
// the latency (46) and the micro-op count (37) are modelled.
821def JWriteJVZEROUPPER: SchedWriteRes<[]> {
822  let Latency = 46;
823  let NumMicroOps = 37;
824}
825def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
826
827///////////////////////////////////////////////////////////////////////////////
828//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
829///////////////////////////////////////////////////////////////////////////////
830
// Write shared by all four (V)MASKMOVDQU opcodes: latency 34 and 63
// micro-ops. ResourceCycles is positional and pairs with the resource list
// above it, i.e. JFPU0=1, JFPA=1, JFPU1=2, JSTC=2, JLAGU=2, JSAGU=16,
// JALU01=42.
831def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
832  let Latency = 34;
833  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
834  let NumMicroOps = 63;
835}
836def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
837                                         VMASKMOVDQU, VMASKMOVDQU64)>;
838
839///////////////////////////////////////////////////////////////////////////////
840//  SchedWriteVariant definitions.
841///////////////////////////////////////////////////////////////////////////////
842
// Write with no resources and zero latency. It is the alternative selected
// by most of the zero-idiom SchedWriteVariant definitions that follow.
843def JWriteZeroLatency : SchedWriteRes<[]> {
844  let Latency = 0;
845}
846
// Zero-idiom write used by the YMM variants below. Unlike JWriteZeroLatency
// it still names the JFPU01/JFPX resources and counts as 2 micro-ops; Latency
// is not overridden here.
847def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
848  let NumMicroOps = 2;
849}
850
851// Certain instructions that use the same register for both source
852// operands do not have a real dependency on the previous contents of the
853// register, and thus, do not have to wait before completing. They can be
854// optimized out at register renaming stage.
855// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
856// 15h Processors".
857// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
858// Section 21.8 [Dependency-breaking instructions].
859
// GPR zero idioms: when ZeroIdiomPredicate holds the instruction is given
// JWriteZeroLatency; otherwise (NoSchedPred) it uses the regular WriteALU.
860def JWriteZeroIdiom : SchedWriteVariant<[
861    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
862    SchedVar<NoSchedPred,                          [WriteALU]>
863]>;
// Applied to the 32/64-bit register-register SUB and XOR opcodes.
864def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
865                                        XOR32rr, XOR64rr)>;
866
// FP logic zero idioms (XMM): JWriteZeroLatency when ZeroIdiomPredicate
// holds, the regular WriteFLogic otherwise.
867def JWriteFZeroIdiom : SchedWriteVariant<[
868    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
869    SchedVar<NoSchedPred,                          [WriteFLogic]>
870]>;
// Applied to the SSE and VEX-encoded XORPS/XORPD/ANDNPS/ANDNPD rr opcodes.
871def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
872                                         ANDNPSrr, VANDNPSrr,
873                                         ANDNPDrr, VANDNPDrr)>;
874
// FP logic zero idioms (YMM). Note that the zero-idiom alternative here is
// JWriteZeroIdiomYmm rather than JWriteZeroLatency; the fallback is
// WriteFLogicY.
875def JWriteFZeroIdiomY : SchedWriteVariant<[
876    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
877    SchedVar<NoSchedPred,                          [WriteFLogicY]>
878]>;
879def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
880                                          VANDNPSYrr, VANDNPDYrr)>;
881
// MMX logic zero idioms: JWriteZeroLatency when ZeroIdiomPredicate holds,
// the regular WriteVecLogic otherwise. Applied to MMX PXOR and PANDN.
882def JWriteVZeroIdiomLogic : SchedWriteVariant<[
883    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
884    SchedVar<NoSchedPred,                          [WriteVecLogic]>
885]>;
886def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
887
// XMM integer logic zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
// holds, the regular WriteVecLogicX otherwise. Applied to the SSE and
// VEX-encoded PXOR/PANDN rr opcodes.
888def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
889    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
890    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
891]>;
892def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
893                                               PANDNrr, VPANDNrr)>;
894
// MMX ALU zero idioms: JWriteZeroLatency when ZeroIdiomPredicate holds, the
// regular WriteVecALU otherwise.
895def JWriteVZeroIdiomALU : SchedWriteVariant<[
896    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
897    SchedVar<NoSchedPred,                          [WriteVecALU]>
898]>;
// Applied to the MMX subtract (plain, signed- and unsigned-saturating) and
// compare-greater-than opcodes.
899def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
900                                            MMX_PSUBQirr, MMX_PSUBWirr,
901                                            MMX_PSUBSBirr, MMX_PSUBSWirr,
902                                            MMX_PSUBUSBirr, MMX_PSUBUSWirr,
903                                            MMX_PCMPGTBirr, MMX_PCMPGTDirr,
904                                            MMX_PCMPGTWirr)>;
905
// XMM integer ALU zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
// holds, the regular WriteVecALUX otherwise.
906def JWriteVZeroIdiomALUX : SchedWriteVariant<[
907    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
908    SchedVar<NoSchedPred,                          [WriteVecALUX]>
909]>;
// Each line below pairs an SSE opcode with its VEX-encoded counterpart.
// Unlike the MMX list, this one also includes the quadword compare (PCMPGTQ).
910def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
911                                             PSUBDrr, VPSUBDrr,
912                                             PSUBQrr, VPSUBQrr,
913                                             PSUBWrr, VPSUBWrr,
914                                             PSUBSBrr, VPSUBSBrr,
915                                             PSUBSWrr, VPSUBSWrr,
916                                             PSUBUSBrr, VPSUBUSBrr,
917                                             PSUBUSWrr, VPSUBUSWrr,
918                                             PCMPGTBrr, VPCMPGTBrr,
919                                             PCMPGTDrr, VPCMPGTDrr,
920                                             PCMPGTQrr, VPCMPGTQrr,
921                                             PCMPGTWrr, VPCMPGTWrr)>;
922
// VPERM2F128rr variant. It uses its own predicate (ZeroIdiomVPERMPredicate,
// defined outside this part of the file) rather than ZeroIdiomPredicate.
// When that predicate holds the instruction gets JWriteZeroIdiomYmm;
// otherwise it uses WriteFShuffle256.
923def JWriteVPERM2F128 : SchedWriteVariant<[
924  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
925  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
926]>;
927def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
928
929// This write is used for slow LEA instructions.
// It names JALU1 and JSAGU and sets Latency = 2; it is selected by the
// JWriteLEA variant below when JSlowLEAPredicate holds.
930def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
931  let Latency = 2;
932}
933
934// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
935// with a `Scale` value different than 1.
// The predicate is the OR (CheckAny) of the two conditions. The second one
// is an AND (CheckAll): operand 2 must be an immediate and must not be
// equal to 1.
936def JSlowLEAPredicate : MCSchedPredicate<
937  CheckAny<[
938    // A 3-operand LEA (base, index, offset).
939    IsThreeOperandsLEAFn,
940    // An LEA with a "Scale" different than 1.
941    CheckAll<[
942      CheckIsImmOperand<2>,
943      CheckNot<CheckImmOperand<2, 1>>
944    ]>
945  ]>
946>;
947
// LEA variant: instructions matching JSlowLEAPredicate get JWrite3OpsLEA,
// everything else falls back to the generic WriteLEA.
948def JWriteLEA : SchedWriteVariant<[
949    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
950    SchedVar<NoSchedPred,       [WriteLEA]>
951]>;
952
// Applied to the 32/64-bit LEA opcodes only; LEA16r is bound to its own
// write (JSlowLEA16r) elsewhere in this file.
953def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
954
// Unconditional write for the 16-bit LEA: latency 3 and 4 cycles on JALU01.
// No predicate is involved, so every LEA16r uses it.
955def JSlowLEA16r : SchedWriteRes<[JALU01]> {
956  let Latency = 3;
957  let ResourceCycles = [4];
958}
959
960def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
961
962///////////////////////////////////////////////////////////////////////////////
963// Dependency breaking instructions.
964///////////////////////////////////////////////////////////////////////////////
965
// Declares which opcodes are zero idioms on this model. Each DepBreakingClass
// pairs a list of opcodes with the predicate that must hold for them.
// The opcode lists correspond to the InstRW lists bound to the
// JWrite*ZeroIdiom* variants and to JWriteVPERM2F128 earlier in this file;
// keep the two in sync when adding or removing an opcode.
// NOTE(review): presumably this is what analysis tools query to recognise
// zero idioms -- confirm where IsZeroIdiomFunction is consumed.
966def : IsZeroIdiomFunction<[
967  // GPR Zero-idioms.
968  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
969
970  // MMX Zero-idioms.
971  DepBreakingClass<[
972    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
973    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
974    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
975    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
976  ], ZeroIdiomPredicate>,
977
978  // SSE Zero-idioms.
979  DepBreakingClass<[
980    // fp variants.
981    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
982
983    // int variants.
984    PXORrr, PANDNrr,
985    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
986    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
987    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
988  ], ZeroIdiomPredicate>,
989
990  // AVX Zero-idioms.
991  DepBreakingClass<[
992    // xmm fp variants.
993    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
994
995    // xmm int variants.
996    VPXORrr, VPANDNrr,
997    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
998    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
999    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
1000
1001    // ymm variants.
1002    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1003  ], ZeroIdiomPredicate>,
1004
  // VPERM2F128 has its own predicate instead of ZeroIdiomPredicate.
1005  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
1006]>;
1007
// Declares opcodes that are dependency-breaking; none of them appears in the
// IsZeroIdiomFunction list earlier in this file. Each DepBreakingClass pairs
// a list of opcodes with the predicate that must hold for them.
// No InstRW in this part of the file gives these opcodes a special write.
1008def : IsDepBreakingFunction<[
1009  // GPR
1010  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  // CMP is the only class here that uses an explicit same-register check on
  // operands 0 and 1 instead of ZeroIdiomPredicate.
1011  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
1012
1013  // MMX
1014  DepBreakingClass<[
1015    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
1016  ], ZeroIdiomPredicate>,
1017
1018  // SSE
1019  DepBreakingClass<[
1020    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1021  ], ZeroIdiomPredicate>,
1022
1023  // AVX
1024  DepBreakingClass<[
1025    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1026  ], ZeroIdiomPredicate>
1027]>;
1028
// Declares the register-to-register move opcodes that are flagged as
// optimizable on this model. All opcodes sit in a single equivalence class
// guarded by TruePred, i.e. no additional condition is attached.
// NOTE(review): presumably these are the moves that can be eliminated at
// register renaming -- confirm where IsOptimizableRegisterMove is consumed.
1029def : IsOptimizableRegisterMove<[
1030  InstructionEquivalenceClass<[
1031    // GPR variants.
1032    MOV32rr, MOV64rr,
1033
1034    // MMX variants.
1035    MMX_MOVQ64rr,
1036
1037    // SSE variants.
1038    MOVAPSrr, MOVUPSrr,
1039    MOVAPDrr, MOVUPDrr,
1040    MOVDQArr, MOVDQUrr,
1041
1042    // AVX variants.
1043    VMOVAPSrr, VMOVUPSrr,
1044    VMOVAPDrr, VMOVUPDrr,
1045    VMOVDQArr, VMOVDQUrr
1046  ], TruePred >
1047]>;
1048
1049} // SchedModel
1050