xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td (revision 6966ac055c3b7a39266fb982493330df7a097997)
1//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD btver2 (Jaguar) to support
10// instruction scheduling and other instruction cost heuristics. Based off AMD Software
11// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
12//
13//===----------------------------------------------------------------------===//
14
// Top-level machine model for btver2. The 64-entry micro-op buffer matches
// the size given to the retire control unit (JRCU) later in this file.
15def BtVer2Model : SchedMachineModel {
16  // All x86 instructions are modeled as a single micro-op, and btver2 can
17  // decode 2 instructions per cycle.
18  let IssueWidth = 2;
19  let MicroOpBufferSize = 64; // Retire Control Unit
20  let LoadLatency = 5; // FPU latency (worst case; cf. integer 3-cycle latency)
21  let HighLatency = 25;
22  let MispredictPenalty = 14; // Minimum branch misdirection penalty
23  let PostRAScheduler = 1;
24
25  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
26  // the scheduler to assign a default model to unrecognized opcodes.
27  let CompleteModel = 0;
28}
29
30let SchedModel = BtVer2Model in {
31
32// Jaguar can issue up to 6 micro-ops in one cycle; the six single-unit issue
// pipes are declared below.
33def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
34def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
35def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
36def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
37def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
38def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
39
40// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
41// speculative version of the 64-bit integer registers.
42// Reference: www.realworldtech.com/jaguar/4/
43//
44// The processor always keeps the different parts of an integer register
45// together. An instruction that writes to a part of a register will therefore
46// have a false dependence on any previous write to the same register or any
47// part of it.
48// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
49// access" - Agner Fog's "microarchitecture.pdf".
//
// NOTE(review): RegisterFile is declared outside this chunk. The leading
// arguments are assumed to be <number of physical registers, register classes,
// per-class cost, per-class move-elimination flag> - confirm against
// TargetSchedule.td.
50def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
51                               0,  // Max moves that can be eliminated per cycle.
52                               1>; // Restrict move elimination to zero regs.
53
54// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
55// registers. Operations on 256-bit data types are cracked into two COPs.
56// Reference: www.realworldtech.com/jaguar/4/
57
58// The PRF in the floating point unit can eliminate a move from an MMX or SSE
59// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
60// dependency breaking instruction, or via VZEROALL).
61// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
62// instructions" - Agner Fog's "microarchitecture.pdf"
//
// NOTE(review): the third list gives VR256 a value of 2 where VR64/VR128 get 1;
// presumably this is the rename cost of a 256-bit value cracked into two
// halves - confirm against the RegisterFile declaration.
63def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
64                          0,  // Max moves that can be eliminated per cycle.
65                          1>; // Restrict move elimination to zero regs.
66
67// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
68// retire up to two macro-ops per cycle.
69// Reference: "Software Optimization Guide for AMD Family 16h Processors"
// The 64 here is the same value as MicroOpBufferSize in BtVer2Model.
70def JRCU : RetireControlUnit<64, 2>;
71
// Each group below pairs two of the issue pipes declared earlier and gives the
// pair a shared BufferSize.
// NOTE(review): BufferSize presumably models the number of scheduler entries
// feeding the pair - confirm against the ProcResourceKind documentation.

72// Integer Pipe Scheduler
73def JALU01 : ProcResGroup<[JALU0, JALU1]> {
74  let BufferSize=20;
75}
76
77// AGU Pipe Scheduler
78def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
79  let BufferSize=12;
80}
81
82// Fpu Pipe Scheduler
83def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
84  let BufferSize=18;
85}
86
87// Functional units
// Each is a single-unit resource; the write definitions later in the file list
// them next to the issue pipe they are used with (e.g. [JALU1, JMul]).
88def JDiv    : ProcResource<1>; // integer division
89def JMul    : ProcResource<1>; // integer multiplication
90def JVALU0  : ProcResource<1>; // vector integer
91def JVALU1  : ProcResource<1>; // vector integer
92def JVIMUL  : ProcResource<1>; // vector integer multiplication
93def JSTC    : ProcResource<1>; // vector store/convert
94def JFPM    : ProcResource<1>; // FP multiplication
95def JFPA    : ProcResource<1>; // FP addition
96
97// Functional unit groups
// JFPX: either FP unit (add or multiply). JVALU: either vector integer unit.
98def JFPX  : ProcResGroup<[JFPA, JFPM]>;
99def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
100
101// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
102// cycles after the memory operand.
103def : ReadAdvance<ReadAfterLd, 3>;
104
105// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
106// cycles after the memory operand.
107def : ReadAdvance<ReadAfterVecLd, 5>;
108def : ReadAdvance<ReadAfterVecXLd, 5>;
109def : ReadAdvance<ReadAfterVecYLd, 5>;
110
111/// "Additional 6 cycle transfer operation which moves a floating point
112/// operation input value from the integer unit to the floating point unit."
113/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
/// The advance is negative (-6), unlike the positive load advances above.
114def : ReadAdvance<ReadInt2Fpu, -6>;
115
116// Many SchedWrites are defined in pairs with and without a folded load.
117// Instructions with folded loads are usually micro-fused, so they only appear
118// as two micro-ops when dispatched by the schedulers.
119// This multiclass defines the resource usage for variants with and without
120// folded loads.
//
// Parameters:
//   SchedRW  - the foldable write; SchedRW.Folded names its load-folded twin.
//   ExePorts - resources used by the register variant.
//   Lat      - latency of the register variant.
//   Res      - resource cycles, one entry per ExePorts element (may be empty).
//   UOps     - micro-op count of the register variant.
//   LoadUOps - extra micro-ops added for the folded variant.
// NOTE(review): with the default LoadUOps = 0 the folded variant reports the
// same NumMicroOps as the register variant.
121multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
122                            list<ProcResourceKind> ExePorts,
123                            int Lat, list<int> Res = [], int UOps = 1,
124                            int LoadUOps = 0> {
125  // Register variant: uses Res cycles on ExePorts (Res may be left empty).
126  def : WriteRes<SchedRW, ExePorts> {
127    let Latency = Lat;
128    let ResourceCycles = Res;
129    let NumMicroOps = UOps;
130  }
131
132  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
133  // latency.
134  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
135    let Latency = !add(Lat, 3);
    // An empty Res stays empty; otherwise a 1-cycle entry for JLAGU is
    // prepended so the list still lines up with the resource list.
136    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
137    let NumMicroOps = !add(UOps, LoadUOps);
138  }
139}
140
// FP/vector counterpart of JWriteResIntPair: same parameters and the same
// register/folded pair, but the folded variant adds 5 cycles of latency
// instead of 3.
141multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
142                            list<ProcResourceKind> ExePorts,
143                            int Lat, list<int> Res = [], int UOps = 1,
144                            int LoadUOps = 0> {
145  // Register variant: uses Res cycles on ExePorts (Res may be left empty).
146  def : WriteRes<SchedRW, ExePorts> {
147    let Latency = Lat;
148    let ResourceCycles = Res;
149    let NumMicroOps = UOps;
150  }
151
152  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
153  // latency.
154  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
155    let Latency = !add(Lat, 5);
    // An empty Res stays empty; otherwise a 1-cycle entry for JLAGU is
    // prepended so the list still lines up with the resource list.
156    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
157    let NumMicroOps = !add(UOps, LoadUOps);
158  }
159}
160
// 256-bit (YMM) variant of the pair multiclasses. Differences from
// JWriteResFpuPair: Res defaults to [2] and UOps to 2, and the folded variant
// always prepends a 2-cycle JLAGU entry (there is no empty-Res special case).
// The default Res has a single entry; every use visible in this chunk passes
// Res explicitly.
161multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
162                            list<ProcResourceKind> ExePorts,
163                            int Lat, list<int> Res = [2], int UOps = 2,
164                            int LoadUOps = 0> {
165  // Register variant uses Res cycles on ExePorts (2 by default).
166  def : WriteRes<SchedRW, ExePorts> {
167    let Latency = Lat;
168    let ResourceCycles = Res;
169    let NumMicroOps = UOps;
170  }
171
172  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
173  // latency.
174  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
175    let Latency = !add(Lat, 5);
176    let ResourceCycles = !listconcat([2], Res);
177    let NumMicroOps = !add(UOps, LoadUOps);
178  }
179}
180
181// Instructions that have local forwarding disabled have an extra +1cy latency.
// (Such entries are tagged "+1cy latency" in the tables further down.)
182
183// A folded store needs a cycle on the SAGU for the store data,
184// most RMW instructions don't need an extra uop.
// NOTE(review): X86WriteRes is defined outside this chunk; its arguments appear
// to be <write, ports, latency, resource cycles, uops>, which would make the
// trailing 0 the "no extra uop" mentioned above - confirm.
185defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
186
187////////////////////////////////////////////////////////////////////////////////
188// Arithmetic.
189////////////////////////////////////////////////////////////////////////////////
190
// Simple integer operations issue to either ALU pipe (JALU01) with latency 1.
// WriteADC passes Res = [2], i.e. two cycles on JALU01.
// WriteCMPXCHGRMW additionally lists both AGUs (JSAGU, JLAGU).
// NOTE(review): X86WriteRes is defined outside this chunk; argument order is
// assumed to be <write, ports, latency, resource cycles, uops> - confirm.
190defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
191defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
192
193defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
194defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
195defm : X86WriteRes<WriteCMPXCHG,[JALU01], 1, [1], 1>;
196defm : X86WriteRes<WriteCMPXCHGRMW,[JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>;
197defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [1], 1>;
199
// Integer multiplies issue on JALU1 and use the JMul unit; all pairs are
// 2 uops. Up to 32 bits: latency 3, one JMul cycle. 64-bit: latency 6 with
// JMul held for 4 cycles.
199defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 2>;
200defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 1], 2>;
201defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 3, [1, 1], 2>;
202defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 2>;
203defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 1], 2>;
204defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 2>;
205defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 2>;
206defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
207defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 2>;
208defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 2>;
// WriteIMulH lists only JALU1 (no JMul), unlike the entries above.
209defm : X86WriteRes<WriteIMulH,          [JALU1], 6, [4], 1>;
211
// Integer divides issue on JALU1 and hold JDiv for as many cycles as their
// latency: 12/17/25/41 for 8/16/32/64-bit operands. Signed and unsigned forms
// use identical numbers. The 8-bit forms are 1 uop, the others 2.
211defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
212defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
213defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
214defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
215defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
216defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
217defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
218defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
220
// CRC32: latency 3, four cycles on JALU01, 3 uops.
220defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;
221
222defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
223defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
// The bare WriteRes records below set no fields, so they take whatever
// defaults WriteRes declares (outside this chunk).
224def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
225def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
226def  : WriteRes<WriteLAHFSAHF, [JALU01]>;
228
// Bit-test writes. The *Ld entries add JLAGU and use latency 4 instead of 1.
// NOTE(review): X86WriteRes is defined outside this chunk; the last argument
// is assumed to be the uop count (1/1/5 for test, 2/4/8 for test-and-set) -
// confirm.
228defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
229defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
230defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
231defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
232defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
233defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
235
235// This is for simple LEAs with one or two input operands.
// (The JSAGU declaration notes that 3-operand LEA is handled by the SAGU.)
236def : WriteRes<WriteLEA, [JALU01]>;
238
238// Bit counts.
// BSF/BSR are far heavier than the others: latency 4/5, eight cycles on
// JALU01, and 7/8 uops. POPCNT and LZCNT are latency 1 with default cycles
// and uops; TZCNT is latency 2, two cycles, 2 uops.
239defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
240defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
241defm : JWriteResIntPair<WritePOPCNT,         [JALU01], 1>;
242defm : JWriteResIntPair<WriteLZCNT,          [JALU01], 1>;
243defm : JWriteResIntPair<WriteTZCNT,          [JALU01], 2, [2], 2>;
245
245// BMI1 BEXTR/BLS, BMI2 BZHI
// NOTE(review): X86WriteResPairUnsupported is defined outside this chunk;
// presumably it marks the write (and its folded twin) as unsupported on this
// CPU - confirm.
246defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
247defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
248defm : X86WriteResPairUnsupported<WriteBZHI>;
250
251////////////////////////////////////////////////////////////////////////////////
252// Integer shifts and rotates.
253////////////////////////////////////////////////////////////////////////////////
254
// Shifts and rotates, by immediate or by CL, all use either ALU pipe with
// latency 1 and the multiclass defaults for cycles and uops.
254defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
255defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
256defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
257defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
259
259// SHLD/SHRD.
// Register forms use JALU01 only; memory forms add JLAGU and share one set of
// numbers for the immediate and CL variants.
// NOTE(review): X86WriteRes is defined outside this chunk; argument order is
// assumed to be <write, ports, latency, resource cycles, uops> - confirm.
260defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
261defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
262defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
263defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
265
266////////////////////////////////////////////////////////////////////////////////
267// Loads, stores, and moves, not folded with other operations.
268////////////////////////////////////////////////////////////////////////////////
269
// Loads use the load AGU (JLAGU) with the 3-cycle integer load latency;
// stores (including non-temporal) use the store AGU (JSAGU); register moves
// use either ALU pipe. Records without a body keep the WriteRes defaults.
269def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
270def : WriteRes<WriteStore,   [JSAGU]>;
271def : WriteRes<WriteStoreNT, [JSAGU]>;
272def : WriteRes<WriteMove,    [JALU01]>;
274
274// Load/store MXCSR.
// Modeled with the same resources and latency as WriteLoad / WriteStore.
275def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
276def : WriteRes<WriteSTMXCSR, [JSAGU]>;
278
278// Treat misc copies as a move.
// i.e. the COPY instruction is given the WriteMove scheduling class.
279def : InstRW<[WriteMove], (instrs COPY)>;
281
282////////////////////////////////////////////////////////////////////////////////
283// Idioms that clear a register, like xorps %xmm0, %xmm0.
284// These can often bypass execution ports completely.
285////////////////////////////////////////////////////////////////////////////////
286
// The resource list is empty: no issue pipe or functional unit is listed.
286def : WriteRes<WriteZero,  []>;
288
289////////////////////////////////////////////////////////////////////////////////
290// Branches don't produce values, so they have no latency, but they still
291// consume resources. Indirect branches can fold loads.
292////////////////////////////////////////////////////////////////////////////////
293
// Defined as a pair so the load-folded form (JLAGU added, latency + 3) exists
// for branches that fold a load.
293defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
295
296////////////////////////////////////////////////////////////////////////////////
297// Special case scheduling classes.
298////////////////////////////////////////////////////////////////////////////////
299
// System and microcoded writes share one entry shape: JALU01 with latency 100.
// NOTE(review): 100 looks like a deliberately large placeholder rather than a
// measured latency - confirm.
299def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
300def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
// Fences use the store AGU with the WriteRes defaults.
301def : WriteRes<WriteFence,  [JSAGU]>;
303
303// Nops don't have dependencies, so there's no actual latency, but we set this
304// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
// The slot may be on either ALU pipe (JALU01).
305def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
307
308////////////////////////////////////////////////////////////////////////////////
309// Floating point. This covers both scalar and vector operations.
310////////////////////////////////////////////////////////////////////////////////
311
// x87 constant loads (FLD0/FLD1/FLDC) use JFPU1 + JSTC with latency 3.
// Plain FP loads use the load AGU plus any FP pipe/unit with latency 5 and
// identical numbers for all three widths. Masked loads have latency 6; the Y
// form doubles the resource cycles and uops of the 128-bit form.
// NOTE(review): X86WriteRes is defined outside this chunk; argument order is
// assumed to be <write, ports, latency, resource cycles, uops> - confirm.
311defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
312defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
313defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
314defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
315defm : X86WriteRes<WriteFLoadX,        [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
316defm : X86WriteRes<WriteFLoadY,        [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
317defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
318defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
320
// FP stores use the store AGU plus JFPU1 + JSTC; masked stores use JFPU01 +
// JFPX instead. Non-temporal stores use latency 3, and the NTY form doubles
// the resource cycles.
// NOTE(review): WriteFStore uses latency 2 while the X and Y forms use 1 -
// confirm this difference is intended.
320defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
321defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
322defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
323defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
324defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
325defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
326defm : X86WriteRes<WriteFMaskedStore,  [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
327defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
329
// FP register moves: any FP pipe and unit, latency 1. The Y form uses two
// cycles on each resource and 2 uops.
329defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
330defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
331defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;
332
// EMMS uses the same resources as a move, with latency 2.
333defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;
335
// FP add and compare run on JFPU0 + JFPA: add latency 3, compare latency 2,
// with the same numbers for 32- and 64-bit element types. The Y forms go
// through JWriteResYMMPair with two cycles per resource and 2 uops. The Z
// forms use X86WriteResPairUnsupported (defined outside this chunk).
335defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>;
336defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
337defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
338defm : X86WriteResPairUnsupported<WriteFAddZ>;
339defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
340defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
341defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
342defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
343defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
344defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
345defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
346defm : X86WriteResPairUnsupported<WriteFCmpZ>;
347defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
348defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
349defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
350defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
// WriteFCom also lists JALU0 alongside JFPU0 + JFPA.
351defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
// FP multiply runs on JFPU1 + JFPM: latency 2 for 32-bit elements; latency 4
// with JFPM held for 2 cycles for 64-bit elements (4 cycles for the Y form).
352defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
353defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
354defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
355defm : X86WriteResPairUnsupported<WriteFMulZ>;
356defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
357defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
358defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
359defm : X86WriteResPairUnsupported<WriteFMul64Z>;
// All FMA writes go through X86WriteResPairUnsupported.
360defm : X86WriteResPairUnsupported<WriteFMA>;
361defm : X86WriteResPairUnsupported<WriteFMAX>;
362defm : X86WriteResPairUnsupported<WriteFMAY>;
363defm : X86WriteResPairUnsupported<WriteFMAZ>;
// Dot products use both FP units (JFPM and JFPA) for 3 cycles each (6 for
// the Y form) and are multi-uop: 3, 5 and 10 uops respectively.
364defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
365defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
366defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
367defm : X86WriteResPairUnsupported<WriteDPPSZ>;
// Reciprocal, reciprocal-sqrt, divide and sqrt all run on JFPU1 + JFPM.
// Rcp/Rsqrt: latency 2. Divide and sqrt hold JFPM for as many cycles as their
// latency (19 for divide; 21/27/35 for 32-/64-/80-bit sqrt); the Y forms
// double both the latency and the JFPM cycles.
368defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
369defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
370defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
371defm : X86WriteResPairUnsupported<WriteFRcpZ>;
372defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
373defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
374defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
375defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
376defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
377defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
378defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
379defm : X86WriteResPairUnsupported<WriteFDivZ>;
380defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
381defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
382defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
383defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
384defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
385defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
386defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
387defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
388defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
389defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
390defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
391defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
392defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
// Sign, round, logic, test, shuffle and blend writes. Logic/shuffle/blend
// can use any FP pipe and unit (JFPU01 + JFPX) with latency 1; the variable
// shuffle/blend forms hold resources for several cycles and are multi-uop.
// Entries tagged "+1cy latency" are the ones the earlier comment describes as
// having local forwarding disabled.
393defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
394defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
395defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
396defm : X86WriteResPairUnsupported<WriteFRndZ>;
397defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
398defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
399defm : X86WriteResPairUnsupported<WriteFLogicZ>;
400defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
401defm : JWriteResYMMPair<WriteFTestY ,     [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
402defm : X86WriteResPairUnsupported<WriteFTestZ>;
403defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
404defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
405defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
406defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
407defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
408defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
409defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
410defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
411defm : X86WriteResPairUnsupported<WriteFBlendZ>;
412defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
413defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
414defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// WriteFShuffle256 uses the non-YMM pair multiclass but passes the doubled
// cycles and 2 uops explicitly.
415defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
416defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
418
419////////////////////////////////////////////////////////////////////////////////
420// Conversions.
421////////////////////////////////////////////////////////////////////////////////
422
// FP -> integer conversions. The scalar forms (SS2I/SD2I) list five resources
// ending in JALU0, with latency 7 and 2 uops. The packed forms stay on
// JFPU1 + JSTC with latency 3; WriteCvtPD2IY additionally uses JFPX for 4
// cycles, with latency 6 and 3 uops.
422defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
423defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
424defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
425defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
426defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
427defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
428defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
429defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
431
// Integer -> FP conversions. The scalar forms spell out the register and
// load (Ld) entries separately with X86WriteRes instead of using a pair
// multiclass; the packed forms use the pair multiclasses on JFPU1 + JSTC.
// NOTE(review): X86WriteRes is defined outside this chunk; argument order is
// assumed to be <write, ports, latency, resource cycles, uops>, which gives
// the register forms 2 uops and the Ld forms 1 - confirm.
431defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
432defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
433defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
434defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
435defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
436defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
437defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
438defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
439defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
440defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
442
// FP <-> FP precision conversions on JFPU1 + JSTC. Both scalar forms use
// latency 7, two JSTC cycles and 2 uops. Packed PS2PD is latency 2; packed
// PD2PS is latency 3, and its Y form adds JFPX for 4 cycles (latency 6,
// 3 uops).
442defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
443defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
444defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
445defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
446
447defm : JWriteResFpuPair<WriteCvtSD2SS,    [JFPU1, JSTC], 7, [1,2], 2>;
448defm : JWriteResFpuPair<WriteCvtPD2PS,    [JFPU1, JSTC], 3, [1,1], 1>;
449defm : JWriteResYMMPair<WriteCvtPD2PSY,   [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
450defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
452
// Half-precision conversions on JFPU1 + JSTC. PH2PS uses the pair
// multiclasses; PS2PH spells out separate register and store (St) entries,
// where each St entry adds JSAGU and one cycle of latency to its register
// counterpart.
452defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
453defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
454defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
455
456defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
457defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
458defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
459defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
460defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
461defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
463
464////////////////////////////////////////////////////////////////////////////////
465// Vector integer operations.
466////////////////////////////////////////////////////////////////////////////////
467
// Vector integer loads mirror the FP load entries but name JVALU instead of
// JFPX: latency 5 for plain and non-temporal loads at every width, latency 6
// for masked loads, with the Y masked form doubling cycles and uops.
467defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
468defm : X86WriteRes<WriteVecLoadX,         [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
469defm : X86WriteRes<WriteVecLoadY,         [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
470defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
471defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
472defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
473defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
475
// Vector integer stores use the store AGU plus JFPU1 + JSTC; masked stores
// use JFPU01 + JVALU instead and have latency 6.
// NOTE(review): the non-temporal entries here use latency 2, whereas the FP
// non-temporal store entries in this file use 3 - confirm this is intended.
475defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
476defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
477defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
478defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
479defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
480defm : X86WriteRes<WriteVecMaskedStore,   [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
481defm : X86WriteRes<WriteVecMaskedStoreY,  [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
483
// Vector register moves: any FP pipe plus a vector ALU, latency 1 (the Y form
// doubles cycles and uops). Moves that cross to or from the general-purpose
// registers are slower: latency 4 to a GPR (via JFPU0 + JFPA + JALU0) and
// latency 8 from a GPR.
483defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
484defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
485defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
486defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
487defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
489
// Vector integer ALU, shift, multiply and SAD writes. Unlike the FP tables,
// every Y (256-bit) and Z form in this group goes through
// X86WriteResPairUnsupported, as do the variable vector shifts.
// Multiplies use JFPU0 + JVIMUL; the other supported entries use JFPU01 +
// JVALU.
489defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
490defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
491defm : X86WriteResPairUnsupported<WriteVecALUY>;
492defm : X86WriteResPairUnsupported<WriteVecALUZ>;
493defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
494defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
495defm : X86WriteResPairUnsupported<WriteVecShiftY>;
496defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
497defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
498defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
499defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
500defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
501defm : X86WriteResPairUnsupported<WriteVarVecShift>;
502defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
503defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
504defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
505defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
506defm : X86WriteResPairUnsupported<WriteVecIMulY>;
507defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
// PMULLD uses both the multiplier and a vector ALU: latency 4, 3 uops.
508defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
509defm : X86WriteResPairUnsupported<WritePMULLDY>;
510defm : X86WriteResPairUnsupported<WritePMULLDZ>;
511defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
512defm : X86WriteResPairUnsupported<WriteMPSADY>;
513defm : X86WriteResPairUnsupported<WriteMPSADZ>;
514defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
515defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
516defm : X86WriteResPairUnsupported<WritePSADBWY>;
517defm : X86WriteResPairUnsupported<WritePSADBWZ>;
// Vector integer shuffle, blend, logic and test writes. Supported entries use
// JFPU01 + JVALU except the test writes, which mirror the FP test entries
// (JFPU0 + JFPA + JALU0, latency 3). As in the ALU group, Y/Z forms and the
// 256-bit shuffles go through X86WriteResPairUnsupported; WriteVecTestY is
// the one Y form with real numbers.
518defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
519defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
520defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
521defm : X86WriteResPairUnsupported<WriteShuffleY>;
522defm : X86WriteResPairUnsupported<WriteShuffleZ>;
523defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
524defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
525defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
526defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
527defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
528defm : X86WriteResPairUnsupported<WriteBlendY>;
529defm : X86WriteResPairUnsupported<WriteBlendZ>;
530defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
531defm : X86WriteResPairUnsupported<WriteVarBlendY>;
532defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
533defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
534defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
535defm : X86WriteResPairUnsupported<WriteVecLogicY>;
536defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
537defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
538defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
539defm : X86WriteResPairUnsupported<WriteVecTestZ>;
540defm : X86WriteResPairUnsupported<WriteShuffle256>;
541defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
543
544////////////////////////////////////////////////////////////////////////////////
545// Vector insert/extract operations.
546////////////////////////////////////////////////////////////////////////////////
547
// Insert entries use JFPU01 + JVALU (plus JLAGU for the load form). Extract
// to a register ends on JALU0; extract-and-store uses JFPU1 + JSTC + JSAGU.
// NOTE(review): X86WriteRes is defined outside this chunk; argument order is
// assumed to be <write, ports, latency, resource cycles, uops>, which gives
// WriteVecInsert 2 uops but WriteVecInsertLd only 1 - confirm.
547defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
548defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
549defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
550defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
552
553////////////////////////////////////////////////////////////////////////////////
554// SSE42 String instructions.
555////////////////////////////////////////////////////////////////////////////////
556
// Implicit-length forms (IStr): 3 uops, latency 7 (index) / 8 (mask).
// Explicit-length forms (EStr): 9 uops, latency 14; their register variants
// already list both AGUs (JSAGU, JLAGU), and the pair multiclass prepends a
// further JLAGU entry for the folded variant.
556defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
557defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
558defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
559defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
561
562////////////////////////////////////////////////////////////////////////////////
563// MOVMSK Instructions.
564////////////////////////////////////////////////////////////////////////////////
565
// All supported MOVMSK writes share one shape: JFPU0 + JFPA + JALU0 with
// latency 3. The Y form goes through X86WriteResUnsupported (defined outside
// this chunk).
565def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
566def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
567defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
568def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
570
571////////////////////////////////////////////////////////////////////////////////
572// AES Instructions.
573////////////////////////////////////////////////////////////////////////////////
574
// AESIMC and AESKeyGen use the vector integer multiplier (JFPU0 + JVIMUL)
// with latency 2. The encrypt/decrypt write uses a vector ALU as well as the
// multiplier: latency 3, 2 uops.
574defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
575defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
576defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
578
579////////////////////////////////////////////////////////////////////////////////
580// Horizontal add/sub  instructions.
581////////////////////////////////////////////////////////////////////////////////
582
// FP horizontal adds use JFPU0 + JFPA; integer horizontal adds use
// JFPU01 + JVALU. The YMM FP form goes through JWriteResYMMPair with explicit
// per-resource values, and the YMM integer form (WritePHAddY) is declared
// unsupported for this model.
583defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
584defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
585defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
586defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
587defm : X86WriteResPairUnsupported<WritePHAddY>;
588
589////////////////////////////////////////////////////////////////////////////////
590// Carry-less multiplication instructions.
591////////////////////////////////////////////////////////////////////////////////
592
// Carry-less multiply uses the same resources and numeric argument as the
// WriteAESIMC / WriteAESKeyGen entries in this file (JFPU0 + JVIMUL, 2).
593defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
594
595////////////////////////////////////////////////////////////////////////////////
596// SSE4A instructions.
597////////////////////////////////////////////////////////////////////////////////
598
// Dedicated write for the SSE4A INSERTQ / INSERTQI instructions: latency 2,
// with resource cycles listed in the same order as the resources
// (JFPU01 -> 1, JVALU -> 4).
599def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
600  let Latency = 2;
601  let ResourceCycles = [1, 4];
602}
// Override the scheduling class of both INSERTQ encodings with the write above.
603def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
604
605////////////////////////////////////////////////////////////////////////////////
606// AVX instructions.
607////////////////////////////////////////////////////////////////////////////////
608
// Write for the register form of VEXTRACTF128. It names JFPU01 and JFPX and
// overrides no fields, so Latency, ResourceCycles and NumMicroOps keep the
// SchedWriteRes defaults.
609def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
610def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
611
// Write for the YMM broadcast-from-memory instructions listed below:
// latency 6, 2 micro-ops, and resource cycles given in the same order as the
// resources (JLAGU -> 1, JFPU01 -> 2, JFPX -> 4).
612def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
613  let Latency = 6;
614  let ResourceCycles = [1, 2, 4];
615  let NumMicroOps = 2;
616}
617def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
618                                            VBROADCASTSSYrm,
619                                            VBROADCASTF128)>;
620
// Write for VZEROALL: latency 90 and 73 micro-ops. The resource list is empty,
// so no processor resource is consumed in the model.
621def JWriteJVZEROALL: SchedWriteRes<[]> {
622  let Latency = 90;
623  let NumMicroOps = 73;
624}
625def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
626
// Write for VZEROUPPER: latency 46 and 37 micro-ops. The resource list is
// empty, so no processor resource is consumed in the model.
627def JWriteJVZEROUPPER: SchedWriteRes<[]> {
628  let Latency = 46;
629  let NumMicroOps = 37;
630}
631def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
632
633///////////////////////////////////////////////////////////////////////////////
634//  SchedWriteVariant definitions.
635///////////////////////////////////////////////////////////////////////////////
636
// Zero-latency write that consumes no processor resources. It is the result
// selected by the zero-idiom SchedWriteVariants in this file when their
// predicate matches.
637def JWriteZeroLatency : SchedWriteRes<[]> {
638  let Latency = 0;
639}
640
// Write selected for YMM zero idioms (see JWriteFZeroIdiomY and
// JWriteVPERM2F128). Unlike JWriteZeroLatency it still names JFPU01 and JFPX
// and is 2 micro-ops; Latency is not overridden and keeps the SchedWriteRes
// default.
641def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
642  let NumMicroOps = 2;
643}
644
645// Certain instructions that use the same register for both source
646// operands do not have a real dependency on the previous contents of the
647// register, and thus, do not have to wait before completing. They can be
648// optimized out at register renaming stage.
649// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
650// 15h Processors".
651// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
652// Section 21.8 [Dependency-breaking instructions].
653
// GPR zero idioms: when ZeroIdiomPredicate matches, the instruction is
// scheduled as JWriteZeroLatency; otherwise (NoSchedPred fallback) it is a
// regular WriteALU.
654def JWriteZeroIdiom : SchedWriteVariant<[
655    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
656    SchedVar<NoSchedPred,                          [WriteALU]>
657]>;
// Applies to the 32/64-bit register-register SUB and XOR.
658def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
659                                        XOR32rr, XOR64rr)>;
660
// XMM floating-point logic zero idioms: JWriteZeroLatency when
// ZeroIdiomPredicate matches, otherwise the regular WriteFLogic.
661def JWriteFZeroIdiom : SchedWriteVariant<[
662    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
663    SchedVar<NoSchedPred,                          [WriteFLogic]>
664]>;
// Applies to the SSE and VEX-encoded register forms of XORPS/XORPD and
// ANDNPS/ANDNPD.
665def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
666                                         ANDNPSrr, VANDNPSrr,
667                                         ANDNPDrr, VANDNPDrr)>;
668
// YMM floating-point logic zero idioms. The matching case selects
// JWriteZeroIdiomYmm (not JWriteZeroLatency); the fallback is WriteFLogicY.
669def JWriteFZeroIdiomY : SchedWriteVariant<[
670    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
671    SchedVar<NoSchedPred,                          [WriteFLogicY]>
672]>;
673def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
674                                          VANDNPSYrr, VANDNPDYrr)>;
675
// MMX vector-logic zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
// matches, otherwise the regular WriteVecLogic.
676def JWriteVZeroIdiomLogic : SchedWriteVariant<[
677    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
678    SchedVar<NoSchedPred,                          [WriteVecLogic]>
679]>;
680def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
681
// XMM vector-logic zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
// matches, otherwise the regular WriteVecLogicX.
682def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
683    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
684    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
685]>;
// Applies to the SSE and VEX-encoded register forms of PXOR and PANDN.
686def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
687                                               PANDNrr, VPANDNrr)>;
688
// MMX vector-ALU zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
// matches, otherwise the regular WriteVecALU.
689def JWriteVZeroIdiomALU : SchedWriteVariant<[
690    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
691    SchedVar<NoSchedPred,                          [WriteVecALU]>
692]>;
// Applies to the MMX register forms of PSUB*, PSUBS*, PSUBUS* and PCMPGT*.
693def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
694                                            MMX_PSUBQirr, MMX_PSUBWirr,
695                                            MMX_PSUBSBirr, MMX_PSUBSWirr,
696                                            MMX_PSUBUSBirr, MMX_PSUBUSWirr,
697                                            MMX_PCMPGTBirr, MMX_PCMPGTDirr,
698                                            MMX_PCMPGTWirr)>;
699
// XMM vector-ALU zero idioms: JWriteZeroLatency when ZeroIdiomPredicate
// matches, otherwise the regular WriteVecALUX.
700def JWriteVZeroIdiomALUX : SchedWriteVariant<[
701    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
702    SchedVar<NoSchedPred,                          [WriteVecALUX]>
703]>;
// Each row pairs the SSE opcode with its VEX-encoded counterpart.
704def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
705                                             PSUBDrr, VPSUBDrr,
706                                             PSUBQrr, VPSUBQrr,
707                                             PSUBWrr, VPSUBWrr,
708                                             PSUBSBrr, VPSUBSBrr,
709                                             PSUBSWrr, VPSUBSWrr,
710                                             PSUBUSBrr, VPSUBUSBrr,
711                                             PSUBUSWrr, VPSUBUSWrr,
712                                             PCMPGTBrr, VPCMPGTBrr,
713                                             PCMPGTDrr, VPCMPGTDrr,
714                                             PCMPGTQrr, VPCMPGTQrr,
715                                             PCMPGTWrr, VPCMPGTWrr)>;
716
// VPERM2F128 (register form) uses its own predicate, ZeroIdiomVPERMPredicate,
// instead of ZeroIdiomPredicate. When it matches the instruction is scheduled
// as JWriteZeroIdiomYmm; otherwise it is a regular WriteFShuffle256.
717def JWriteVPERM2F128 : SchedWriteVariant<[
718  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
719  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
720]>;
721def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
722
723// This write is used for slow LEA instructions.
// Latency 2 on JALU1 and JSAGU. Selected by JWriteLEA when JSlowLEAPredicate
// matches.
724def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
725  let Latency = 2;
726}
727
728// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
729// with a `Scale` value different than 1.
// True when either sub-check holds (CheckAny). The second alternative requires
// both that operand 2 is an immediate and that it is not equal to 1.
// NOTE(review): operand index 2 is presumably the LEA scale operand, as the
// inline comment states -- confirm against the LEA operand layout.
730def JSlowLEAPredicate : MCSchedPredicate<
731  CheckAny<[
732    // A 3-operand LEA (base, index, offset).
733    IsThreeOperandsLEAFn,
734    // An LEA with a "Scale" different than 1.
735    CheckAll<[
736      CheckIsImmOperand<2>,
737      CheckNot<CheckImmOperand<2, 1>>
738    ]>
739  ]>
740>;
741
// LEA scheduling: slow LEAs (JSlowLEAPredicate) use JWrite3OpsLEA; every other
// LEA falls back to the generic WriteLEA.
742def JWriteLEA : SchedWriteVariant<[
743    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
744    SchedVar<NoSchedPred,       [WriteLEA]>
745]>;
746
// Applies to LEA32r, LEA64r and LEA64_32r; LEA16r is handled separately by
// JSlowLEA16r.
747def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
748
// Write used unconditionally for LEA16r: latency 3, holding JALU01 for 4
// resource cycles.
749def JSlowLEA16r : SchedWriteRes<[JALU01]> {
750  let Latency = 3;
751  let ResourceCycles = [4];
752}
753
754def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
755
756///////////////////////////////////////////////////////////////////////////////
757// Dependency breaking instructions.
758///////////////////////////////////////////////////////////////////////////////
759
// Zero-idiom table for this model. Each DepBreakingClass pairs a list of
// opcodes with the predicate under which they count as a zero idiom. The
// opcodes are the same ones given zero-idiom SchedWriteVariants earlier in
// this file. All classes use ZeroIdiomPredicate except VPERM2F128rr, which
// uses ZeroIdiomVPERMPredicate.
760def : IsZeroIdiomFunction<[
761  // GPR Zero-idioms.
762  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
763
764  // MMX Zero-idioms.
765  DepBreakingClass<[
766    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
767    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
768    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
769    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
770  ], ZeroIdiomPredicate>,
771
772  // SSE Zero-idioms.
773  DepBreakingClass<[
774    // fp variants.
775    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
776
777    // int variants.
778    PXORrr, PANDNrr,
779    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
780    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
781    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
782  ], ZeroIdiomPredicate>,
783
784  // AVX Zero-idioms.
785  DepBreakingClass<[
786    // xmm fp variants.
787    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
788
789    // xmm int variants.
790    VPXORrr, VPANDNrr,
791    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
792    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
793    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
794
795    // ymm variants.
796    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
797  ], ZeroIdiomPredicate>,
798
  // VPERM2F128 has its own zero-idiom predicate.
799  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
800]>;
801
// Dependency-breaking table for this model, listed separately from the
// zero-idiom table. Each DepBreakingClass pairs a list of opcodes with the
// predicate under which they are dependency-breaking. CMP uses
// CheckSameRegOperand<0, 1> (operands 0 and 1 are the same register); every
// other class uses ZeroIdiomPredicate.
802def : IsDepBreakingFunction<[
803  // GPR
804  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
805  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
806
807  // MMX
808  DepBreakingClass<[
809    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
810  ], ZeroIdiomPredicate>,
811
812  // SSE
813  DepBreakingClass<[
814    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
815  ], ZeroIdiomPredicate>,
816
817  // AVX
818  DepBreakingClass<[
819    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
820  ], ZeroIdiomPredicate>
821]>;
822
// Register-to-register moves flagged as optimizable for this model. All
// opcodes share one InstructionEquivalenceClass guarded by TruePred, i.e. no
// extra condition is checked.
// NOTE(review): presumably consumed as the set of move-elimination
// candidates -- confirm against the IsOptimizableRegisterMove definition.
823def : IsOptimizableRegisterMove<[
824  InstructionEquivalenceClass<[
825    // GPR variants.
826    MOV32rr, MOV64rr,
827
828    // MMX variants.
829    MMX_MOVQ64rr,
830
831    // SSE variants.
832    MOVAPSrr, MOVUPSrr,
833    MOVAPDrr, MOVUPDrr,
834    MOVDQArr, MOVDQUrr,
835
836    // AVX variants.
837    VMOVAPSrr, VMOVUPSrr,
838    VMOVAPDrr, VMOVUPDrr,
839    VMOVDQArr, VMOVDQUrr
840  ], TruePred >
841]>;
842
843} // SchedModel
844