xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td (revision 1f1e2261e341e6ca6862f82261066ef1705f0a7a)
1//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD bdver2 (Piledriver) to support
10// instruction scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 15h Processors.
13//    https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
14//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
15//    http://www.agner.org/optimize/microarchitecture.pdf
16//  * https://www.realworldtech.com/bulldozer/
17//    Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
18//
19//===----------------------------------------------------------------------===//
20
// Top-level machine model record for bdver2 (Piledriver).
// MicroOpBufferSize uses the same 128-entry figure as the PdRCU
// RetireControlUnit defined later in this file; both are marked as guesses.
// CompleteModel is left at 0, so opcodes without explicit scheduling info are
// still accepted.
21def BdVer2Model : SchedMachineModel {
22  let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired.
23  let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed.
24  let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer.
25  let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency.
26  let HighLatency = 25; // FIXME: any better choice?
27  let MispredictPenalty = 20; // Minimum branch misdirection penalty.
28
29  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
30
31  // FIXME: Incomplete. This flag is set to allow the scheduler to assign
32  //        a default model to unrecognized opcodes.
33  let CompleteModel = 0;
34} // SchedMachineModel
35
36let SchedModel = BdVer2Model in {
37
38
39//===----------------------------------------------------------------------===//
40// Pipes
41//===----------------------------------------------------------------------===//
42
43// There are a total of eight pipes.
44
45//===----------------------------------------------------------------------===//
46// Integer execution pipes
47//
48
49// Two EX (ALU) pipes.
50def PdEX0  : ProcResource<1>; // ALU, Integer Pipe0
51def PdEX1  : ProcResource<1>; // ALU, Integer Pipe1
52def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>;
53
54// Two AGLU pipes, identical.
55def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]
56
57//===----------------------------------------------------------------------===//
58// Floating point execution pipes
59//
60
61// Four FPU pipes.
62
63def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
64def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
65def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
66def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3
67
68// FPU grouping
69def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>;
70def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>;
71
72
73//===----------------------------------------------------------------------===//
74// RCU
75//===----------------------------------------------------------------------===//
76
77// The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
78// On the other hand, the RCU reorder buffer size for Piledriver does not
79// seem to be specified in any trustworthy source.
80// But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had
81// RCU reorder buffer size of 128. So that is a good guess for now.
82def PdRCU : RetireControlUnit<128, 4>;
83
84
85//===----------------------------------------------------------------------===//
86// Pipelines
87//===----------------------------------------------------------------------===//
88
89// There are a total of two pipelines, each one with its own scheduler.
90
91//===----------------------------------------------------------------------===//
92// Integer Pipeline Scheduling
93//
94
95// There is one Integer Scheduler per core.
96
97// Integer physical register file has 96 registers of 64-bit.
98def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>;
99
100// Unified Integer, Memory Scheduler has 40 entries.
101def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
102  // Scheduler queue depth shared by the two EX pipes and the AGLU pipes.
103  let BufferSize = 40;
104}
105
106
107//===----------------------------------------------------------------------===//
108// FPU Pipeline Scheduling
109//
110
111// The FPU unit is shared between the two cores.
112
113// FP physical register file has 160 registers of 128-bit.
114// Operations on 256-bit data types are cracked into two COPs.
115def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
116
117// Unified FP Scheduler has 64 entries.
118def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
119  // Scheduler queue depth shared by all four FPU pipes.
120  let BufferSize = 64;
121}
122
123
124//===----------------------------------------------------------------------===//
125// Functional units
126//===----------------------------------------------------------------------===//
127
128//===----------------------------------------------------------------------===//
129// Load-Store Units
130//
131
132let Super = PdAGLU01 in
133def PdLoad  : ProcResource<2> {
134  // For Piledriver, the load queue is 40 entries deep.
135  let BufferSize = 40;
136}
137
138def PdLoadQueue : LoadQueue<PdLoad>;
139
140let Super = PdAGLU01 in
141def PdStore : ProcResource<1> {
142  // For Piledriver, the store queue is 24 entries deep.
143  let BufferSize = 24;
144}
145
146def PdStoreQueue : StoreQueue<PdStore>;
147
148//===----------------------------------------------------------------------===//
149// Integer Execution Units
150//
151
// Single-instance integer functional units. The trailing comment on each line
// names the EX pipe the unit is described as belonging to.
// NOTE(review): PdDiv is annotated as PdEX0 here, but the WriteDiv*/WriteIDiv*
// entries later in this file list [PdEX1, PdDiv] - confirm which pipe is
// intended.
152def PdDiv    : ProcResource<1>; // PdEX0; unpipelined integer division
153def PdCount  : ProcResource<1>; // PdEX0; POPCNT, LZCNT
154
155def PdMul    : ProcResource<1>; // PdEX1; integer multiplication
156def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches
157
158//===----------------------------------------------------------------------===//
159// Floating-Point Units
160//
161
162// Two FMAC/FPFMA units.
163def PdFPFMA  : ProcResource<2>; // PdFPU0, PdFPU1
164
165// One 128-bit integer multiply-accumulate unit.
166def PdFPMMA  : ProcResource<1>; // PdFPU0
167
168// One fp conversion unit.
169def PdFPCVT  : ProcResource<1>; // PdFPU0
170
171// One unit for shuffles, packs, permutes, shifts.
172def PdFPXBR  : ProcResource<1>; // PdFPU1
173
174// Two 128-bit packed integer units.
175def PdFPMAL  : ProcResource<2>; // PdFPU2, PdFPU3
176
177// One FP store unit.
178def PdFPSTO  : ProcResource<1>; // PdFPU3
179
180
181//===----------------------------------------------------------------------===//
182// Basic helper classes.
183//===----------------------------------------------------------------------===//
184
185// Many SchedWrites are defined in pairs with and without a folded load.
186// Instructions with folded loads are usually micro-fused, so they only appear
187// as two micro-ops when dispatched by the schedulers.
188// This multiclass defines the resource usage for variants with and without
189// folded loads.
// PdWriteRes: emits one anonymous WriteRes record binding SchedRW to ExePorts.
//   SchedRW  - the SchedWrite being described.
//   ExePorts - processor resources consumed by the write.
//   Lat      - value assigned to Latency (defaults to 1).
//   Res      - value assigned to ResourceCycles (defaults to the empty list).
//   UOps     - value assigned to NumMicroOps (defaults to 1).
// Note: this multiclass itself only covers a single (non-paired) write; the
// register/folded-load pairing is built by __pdWriteResPair.
190multiclass PdWriteRes<SchedWrite SchedRW,
191                      list<ProcResourceKind> ExePorts, int Lat = 1,
192                      list<int> Res = [], int UOps = 1> {
193  def : WriteRes<SchedRW, ExePorts> {
194    let Latency = Lat;
195    let ResourceCycles = Res;
196    let NumMicroOps = UOps;
197  }
198}
199
// __pdWriteResPair: defines both halves of a foldable write via PdWriteRes.
//  * SchedRW itself receives ExePorts/Lat/Res/UOps unchanged.
//  * SchedRW.Folded receives:
//      - PdLoad prepended to ExePorts,
//      - latency Lat + LoadLat,
//      - micro-op count UOps + LoadUOps,
//      - a ResourceCycles list chosen as follows:
//          Res empty and LoadRes == 1 -> []  (list left empty),
//          Res empty, LoadRes != 1    -> [LoadRes, 1, 1, ...] with one `1`
//                                        per entry of ExePorts,
//          Res non-empty              -> [LoadRes] followed by Res.
//    Prepending LoadRes keeps the list aligned with the prepended PdLoad port.
200multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
201                            list<ProcResourceKind> ExePorts, int Lat,
202                            list<int> Res, int UOps,
203                            int LoadLat, int LoadRes, int LoadUOps> {
204  defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
205
206  defm : PdWriteRes<SchedRW.Folded,
207                    !listconcat([PdLoad], ExePorts),
208                    !add(Lat, LoadLat),
209                    !if(!and(!empty(Res), !eq(LoadRes, 1)),
210                      [],
211                      !listconcat([LoadRes],
212                        !if(!empty(Res),
213                          !listsplat(1, !size(ExePorts)),
214                          Res))),
215                    !add(UOps, LoadUOps)>;
216}
217
// PdWriteResExPair: register + folded-load pair where the folded form adds
// 4 cycles of latency and 3 PdLoad resource cycles.
// Defaults: Lat = 1, Res = [], UOps = 1, LoadUOps = 0 (the folded form gets
// UOps + LoadUOps micro-ops).
218multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
219                            list<ProcResourceKind> ExePorts, int Lat = 1,
220                            list<int> Res = [], int UOps = 1,
221                            int LoadUOps = 0> {
222  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
223                          /*LoadLat*/4, /*LoadRes*/3, LoadUOps>;
224}
225
// PdWriteResXMMPair: register + folded-load pair where the folded form adds
// 5 cycles of latency and 3 PdLoad resource cycles.
// Defaults: Lat = 1, Res = [], UOps = 1, LoadUOps = 0.
226multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
227                             list<ProcResourceKind> ExePorts, int Lat = 1,
228                             list<int> Res = [], int UOps = 1,
229                             int LoadUOps = 0> {
230  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
231                           /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
232}
233
// PdWriteResYMMPair: register + folded-load pair where the folded form adds
// 5 cycles of latency and 3 PdLoad resource cycles.
// Unlike the Ex/XMM variants, Lat has no default and UOps defaults to 2.
// Other defaults: Res = [], LoadUOps = 0.
234multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
235                             list<ProcResourceKind> ExePorts, int Lat,
236                             list<int> Res = [], int UOps = 2,
237                             int LoadUOps = 0> {
238  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
239                           /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
240}
241
242//===----------------------------------------------------------------------===//
243// Here be dragons.
244//===----------------------------------------------------------------------===//
245
246// L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
247// needn't be available until 4 cycles after the memory operand.
248def : ReadAdvance<ReadAfterLd, 4>;
249
250// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
251// until 5 cycles after the memory operand.
252def : ReadAdvance<ReadAfterVecLd, 5>;
253def : ReadAdvance<ReadAfterVecXLd, 5>;
254def : ReadAdvance<ReadAfterVecYLd, 5>;
255
256// Transfer from int domain to ivec domain incurs additional latency of 8..10cy
257// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller
258// and Excavator pipeline", "Data delay between different execution domains"
259def : ReadAdvance<ReadInt2Fpu, -10>;
260
261// A folded store needs a cycle on the PdStore for the store data.
262def : WriteRes<WriteRMW, [PdStore]>;
263
264////////////////////////////////////////////////////////////////////////////////
265// Loads, stores, and moves, not folded with other operations.
266////////////////////////////////////////////////////////////////////////////////
267
268def : WriteRes<WriteLoad,    [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; }
269def : WriteRes<WriteStore,   [PdStore]>;
270def : WriteRes<WriteStoreNT, [PdStore]>;
271def : WriteRes<WriteMove,    [PdEX01]> { let ResourceCycles = [2]; }
272defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
273
274// Load/store MXCSR.
275// FIXME: These are copy and pasted from WriteLoad/Store.
276def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
277def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; }
278
279// Treat misc copies as a move.
280def : InstRW<[WriteMove], (instrs COPY)>;
281
282////////////////////////////////////////////////////////////////////////////////
283// Idioms that clear a register, like xorps %xmm0, %xmm0.
284// These can often bypass execution ports completely.
285////////////////////////////////////////////////////////////////////////////////
286
287def : WriteRes<WriteZero, [/*No ExePorts*/]>;
288
289////////////////////////////////////////////////////////////////////////////////
290// Branches don't produce values, so they have no latency, but they still
291// consume resources. Indirect branches can fold loads.
292////////////////////////////////////////////////////////////////////////////////
293
294defm : PdWriteResExPair<WriteJump,  [PdEX1, PdBranch]>;
295
296////////////////////////////////////////////////////////////////////////////////
297// Special case scheduling classes.
298////////////////////////////////////////////////////////////////////////////////
299
300def : WriteRes<WriteSystem,     [PdEX01]> { let Latency = 100; }
301def : WriteRes<WriteMicrocoded, [PdEX01]> { let Latency = 100; }
302def : WriteRes<WriteFence,      [PdStore]>;
303
304def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
305  let Latency = 6;
306}
307def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
308
309def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
310  let Latency = 184;
311  let ResourceCycles = [375];
312  let NumMicroOps = 45;
313}
314def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
315                                        "LSL(16|32|64)rr")>;
316
317// Nops don't have dependencies, so there's no actual latency, but we set this
318// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
319def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; }
320
321////////////////////////////////////////////////////////////////////////////////
322// Arithmetic.
323////////////////////////////////////////////////////////////////////////////////
324
325defm : PdWriteResExPair<WriteALU,     [PdEX01], 1, [2]>;
326
327def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> {
328  let Latency = 6;
329  let ResourceCycles = [3, 2, 1];
330  let NumMicroOps = 1;
331}
332def : SchedAlias<WriteALURMW, PdWriteALURMW>;
333
334def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
335  let Latency = 6;
336  let ResourceCycles = [88];
337  let NumMicroOps = 4;
338}
339def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
340
341def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
342  let Latency = 2;
343  let ResourceCycles = [2];
344  let NumMicroOps = 2;
345}
346def : InstRW<[PdWriteBMI1],
347             (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
348                     BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
349                     BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
350                     BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
351                     TZMSK32rr, TZMSK64rr)>;
352
353def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> {
354  let Latency = 6;
355  let ResourceCycles = [3, 3];
356  let NumMicroOps = 2;
357}
358def : InstRW<[PdWriteBMI1m],
359             (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
360                     BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
361                     BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
362                     BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
363                     TZMSK32rm, TZMSK64rm)>;
364
365defm : PdWriteResExPair<WriteADC,    [PdEX01],                  1,  [2]>;
366
367def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> {
368  let ResourceCycles = [3];
369}
370def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>;
371
372defm : PdWriteRes<WriteBSWAP32,      [PdEX01]>;
373defm : PdWriteRes<WriteBSWAP64,      [PdEX01]>;
374defm : PdWriteRes<WriteCMPXCHG,      [PdEX1],                   3,  [3],        5>;
375defm : PdWriteRes<WriteCMPXCHGRMW,   [PdEX1, PdStore, PdLoad],  3,  [44, 1, 1], 2>;
376defm : PdWriteRes<WriteXCHG,         [PdEX1],                   1,  [],         2>;
377
378def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
379  let Latency = 3;
380  let ResourceCycles = [3];
381  let NumMicroOps = 3;
382}
383def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
384
385def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
386  let Latency = 3;
387  let ResourceCycles = [23];
388  let NumMicroOps = 5;
389}
390def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
391
392def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
393  let Latency = 3;
394  let ResourceCycles = [21];
395  let NumMicroOps = 6;
396}
397def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
398             (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
399
400def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
401  let Latency = 3;
402  let ResourceCycles = [26];
403  let NumMicroOps = 18;
404}
405def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
406
407def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
408  let Latency = 3;
409  let ResourceCycles = [69];
410  let NumMicroOps = 22;
411}
412def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
413
414def PdWriteXADD : SchedWriteRes<[PdEX1]> {
415  let Latency = 1;
416  let ResourceCycles = [1];
417  let NumMicroOps = 2;
418}
419def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
420
421def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
422  let Latency = 6;
423  let ResourceCycles = [20];
424  let NumMicroOps = 4;
425}
426def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
427
428defm : PdWriteResExPair<WriteIMul8,     [PdEX1, PdMul],          4,  [1, 4]>;
429defm : PdWriteResExPair<WriteIMul16,    [PdEX1, PdMul],          4,  [1, 5],    2>;
430defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul],          5,  [1, 5],    2>;
431defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul],          4,  [1, 2]>;
432defm : PdWriteResExPair<WriteIMul32,    [PdEX1, PdMul],          4,  [1, 4]>;
433defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul],          4,  [1, 2],    1, 1>;
434defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul],          4,  [1, 2]>;
435defm : PdWriteResExPair<WriteIMul64,    [PdEX1, PdMul],          6,  [1, 6]>;
436defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul],          6,  [1, 4],1, 1>;
437defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul],          6,  [1, 4]>;
438
439// BMI2 MULX
440defm : X86WriteResUnsupported<WriteIMulH>;
441defm : X86WriteResUnsupported<WriteIMulHLd>;
442defm : X86WriteResPairUnsupported<WriteMULX32>;
443defm : X86WriteResPairUnsupported<WriteMULX64>;
444
445defm : PdWriteResExPair<WriteDiv8,    [PdEX1, PdDiv],           12,  [1, 12]>;
446defm : PdWriteResExPair<WriteDiv16,   [PdEX1, PdDiv],           15,  [1, 15],   2>;
447defm : PdWriteResExPair<WriteDiv32,   [PdEX1, PdDiv],           14,  [1, 14],   2>;
448defm : PdWriteResExPair<WriteDiv64,   [PdEX1, PdDiv],           14,  [1, 14],   2>;
449
450defm : PdWriteResExPair<WriteIDiv8,   [PdEX1, PdDiv],           12,  [1, 12]>;
451defm : PdWriteResExPair<WriteIDiv16,  [PdEX1, PdDiv],           15,  [1, 17],   2>;
452defm : PdWriteResExPair<WriteIDiv32,  [PdEX1, PdDiv],           14,  [1, 25],   2>;
453defm : PdWriteResExPair<WriteIDiv64,  [PdEX1, PdDiv],           14,  [1, 14],   2>;
454
455defm : PdWriteResExPair<WriteCRC32,   [PdEX01],                  2,  [4],       3>;
456
457def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
458  let Latency = 5;
459  let ResourceCycles = [10];
460  let NumMicroOps = 5;
461}
462def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
463
464def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
465  let Latency = 6;
466  let ResourceCycles = [12];
467  let NumMicroOps = 7;
468}
469def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
470
471def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
472  let Latency = 10;
473  let ResourceCycles = [17];
474  let NumMicroOps = 11;
475}
476def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
477
478defm : PdWriteResExPair<WriteCMOV,    [PdEX01]>; // Conditional move.
479
480def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> {
481  let Latency = 5;
482  let ResourceCycles = [3, 3];
483  let NumMicroOps = 2;
484}
485
// Variant write applied to CMOV16rm/CMOV32rm/CMOV64rm by the InstRW below.
// Each predicate tests immediate operand 7 against one condition code:
// COND_BE, COND_A, COND_L, COND_GE, COND_LE and COND_G all resolve to
// PdWriteCMOVm; anything else falls through to WriteCMOV.Folded.
// NOTE(review): operand 7 is presumably the condition-code operand of the
// rm form - confirm against the instruction's operand layout.
486def PdWriteCMOVmVar : SchedWriteVariant<[
487  SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>,
488  SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>,  [PdWriteCMOVm]>,
489  SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>,  [PdWriteCMOVm]>,
490  SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>,
491  SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>,
492  SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>,  [PdWriteCMOVm]>,
493  SchedVar<NoSchedPred, [WriteCMOV.Folded]>
494]>;
495
496def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
497
498defm : PdWriteRes<WriteFCMOV,        [PdFPU0, PdFPFMA]>; // x87 conditional move.
499
500def : WriteRes<WriteSETCC,           [PdEX01]>; // Setcc.
501def : WriteRes<WriteSETCCStore,      [PdEX01, PdStore]>;
502
503def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
504  let ResourceCycles = [2];
505  let NumMicroOps = 2;
506}
507
// Variant write applied to SETCCm by the InstRW below.
// Each predicate tests immediate operand 5 against one condition code:
// COND_GE, COND_G, COND_LE and COND_L resolve to
// PdWriteSETGEmSETGmSETLEmSETLm; anything else falls through to
// WriteSETCCStore.
// NOTE(review): operand 5 is presumably the condition-code operand of
// SETCCm - confirm against the instruction's operand layout.
508def PdSETGEmSETGmSETLEmSETLm :  SchedWriteVariant<[
509  SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
510  SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>,  [PdWriteSETGEmSETGmSETLEmSETLm]>,
511  SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
512  SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>,  [PdWriteSETGEmSETGmSETLEmSETLm]>,
513  SchedVar<NoSchedPred,                                            [WriteSETCCStore]>
514]>;
515def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>;
516
517defm : PdWriteRes<WriteLAHFSAHF,      [PdEX01],          2,  [4],       2>;
518
519def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
520  let Latency = 2;
521  let ResourceCycles = [4];
522  let NumMicroOps = 4;
523}
524def : InstRW<[PdWriteLAHF], (instrs LAHF)>;
525
526def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
527  let Latency = 2;
528  let ResourceCycles = [2];
529  let NumMicroOps = 2;
530}
531def : InstRW<[PdWriteSAHF], (instrs SAHF)>;
532
533defm : PdWriteRes<WriteBitTest,          [PdEX01],         1, [2],      1>;
534defm : PdWriteRes<WriteBitTestImmLd,     [PdEX01, PdLoad], 5, [2,  3],  1>;
535defm : PdWriteRes<WriteBitTestRegLd,     [PdEX01, PdLoad], 5, [7,  2],  7>;
536defm : PdWriteRes<WriteBitTestSet,       [PdEX01],         2, [2],      2>;
537defm : PdWriteRes<WriteBitTestSetImmLd,  [PdEX01, PdLoad], 6, [1,  1],  4>;
538defm : PdWriteRes<WriteBitTestSetRegLd,  [PdEX01, PdLoad], 6, [1,  1], 10>;
539
540def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> {
541  let Latency = 7;
542  let ResourceCycles = [42, 1];
543  let NumMicroOps = 4;
544}
545def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>;
546def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
547  let Latency = 7;
548  let ResourceCycles = [44, 1];
549  let NumMicroOps = 10;
550}
551def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
552
553// This is for simple LEAs with one or two input operands.
554def : WriteRes<WriteLEA,              [PdEX01]> { let ResourceCycles = [2]; }
555
556// This write is used for slow LEA instructions.
557def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> {
558  let Latency = 2;
559  let ResourceCycles = [2];
560}
561
562// On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
563// or an LEA with a `Scale` value different than 1.
564def PdSlowLEAPredicate : MCSchedPredicate<
565  CheckAny<[
566    // A 3-operand LEA (base, index, offset).
567    IsThreeOperandsLEAFn,
568    // An LEA with a "Scale" different than 1.
569    CheckAll<[
570      CheckIsImmOperand<2>,
571      CheckNot<CheckImmOperand<2, 1>>
572    ]>
573  ]>
574>;
575
576def PdWriteLEA : SchedWriteVariant<[
577    SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>,
578    SchedVar<NoSchedPred,        [WriteLEA]>
579]>;
580
581def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
582
583def PdWriteLEA16r : SchedWriteRes<[PdEX01]> {
584  let ResourceCycles = [3];
585  let NumMicroOps = 2;
586}
587def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>;
588
589// Bit counts.
590defm : PdWriteResExPair<WriteBSF,     [PdEX01],          3,  [6],     6, 2>;
591defm : PdWriteResExPair<WriteBSR,     [PdEX01],          4,  [8],     7, 2>;
592defm : PdWriteResExPair<WritePOPCNT,  [PdEX01],          4,  [4]>;
593defm : PdWriteResExPair<WriteLZCNT,   [PdEX0],           2,  [2],     2>;
594defm : PdWriteResExPair<WriteTZCNT,   [PdEX0],           2,  [2],     2>;
595
596// BMI1 BEXTR, BMI2 BZHI
597defm : PdWriteResExPair<WriteBEXTR,   [PdEX01],          2,  [2],    2>;
598defm : PdWriteResExPair<WriteBLS,     [PdEX01],          2,  [2],    2>;
599defm : PdWriteResExPair<WriteBZHI,    [PdEX01]>;
600
601def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> {
602  let Latency = 2;
603  let ResourceCycles = [4];
604  let NumMicroOps = 2;
605}
606def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>;
607
608def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> {
609  let Latency = 2;
610  let ResourceCycles = [5];
611  let NumMicroOps = 2;
612}
613def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>;
614
615////////////////////////////////////////////////////////////////////////////////
616// Integer shifts and rotates.
617////////////////////////////////////////////////////////////////////////////////
618
619defm : PdWriteResExPair<WriteShift,    [PdEX01], 1, [2]>;
620defm : PdWriteResExPair<WriteShiftCL,  [PdEX01]>;
621defm : PdWriteResExPair<WriteRotate,   [PdEX01], 1, [2]>;
622defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
623
624def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
625  let Latency = 12;
626  let ResourceCycles = [24];
627  let NumMicroOps = 26;
628}
629def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
630
631def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
632  let Latency = 12;
633  let ResourceCycles = [23];
634  let NumMicroOps = 23;
635}
636def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
637
638def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
639  let Latency = 11;
640  let ResourceCycles = [22];
641  let NumMicroOps = 24;
642}
643def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
644
645def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
646  let Latency = 10;
647  let ResourceCycles = [20];
648  let NumMicroOps = 22;
649}
650def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
651
652def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
653  let Latency = 10;
654  let ResourceCycles = [19];
655  let NumMicroOps = 19;
656}
657def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
658
659def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> {
660  let Latency = 7;
661  let ResourceCycles = [14];
662  let NumMicroOps = 17;
663}
664def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>;
665
666def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> {
667  let Latency = 7;
668  let ResourceCycles = [13];
669  let NumMicroOps = 16;
670}
671def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>;
672
673def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
674  let Latency = 7;
675  let ResourceCycles = [14];
676  let NumMicroOps = 15;
677}
678def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
679
680
681def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
682  let Latency = 9;
683  let ResourceCycles = [18];
684  let NumMicroOps = 20;
685}
686def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
687
688def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
689  let Latency = 11;
690  let ResourceCycles = [21];
691  let NumMicroOps = 21;
692}
693def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
694
695def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
696  let Latency = 8;
697  let ResourceCycles = [15];
698  let NumMicroOps = 16;
699}
700def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
701
702def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
703  let Latency = 13;
704  let ResourceCycles = [25];
705  let NumMicroOps = 25;
706}
707def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
708
709// SHLD/SHRD.
710defm : PdWriteRes<WriteSHDrri,       [PdEX01],         3, [6], 6>;
711defm : PdWriteRes<WriteSHDrrcl,      [PdEX01],         3, [8], 7>;
712
713def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
714  let Latency = 3;
715  let ResourceCycles = [6];
716  let NumMicroOps = 6;
717}
718def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>;
719
720def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
721  let Latency = 3;
722  let ResourceCycles = [6];
723  let NumMicroOps = 7;
724}
725def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
726                                                              SHLD32rrCL,
727                                                              SHRD32rrCL)>;
728
729defm : PdWriteRes<WriteSHDmri,       [PdLoad, PdEX01], 4, [1, 22], 8>;
730defm : PdWriteRes<WriteSHDmrcl,      [PdLoad, PdEX01], 4, [1, 22], 8>;
731
732////////////////////////////////////////////////////////////////////////////////
733// Floating point. This covers both scalar and vector operations.
734////////////////////////////////////////////////////////////////////////////////
735
736defm : PdWriteRes<WriteFLD0,               [PdFPU1, PdFPSTO], 3>;
737defm : PdWriteRes<WriteFLD1,               [PdFPU1, PdFPSTO], 3>;
738defm : PdWriteRes<WriteFLDC,               [PdFPU1, PdFPSTO], 3>;
739
740defm : PdWriteRes<WriteFLoad,              [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
741defm : PdWriteRes<WriteFLoadX,             [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
742defm : PdWriteRes<WriteFLoadY,             [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>;
743
744defm : PdWriteRes<WriteFMaskedLoad,        [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>;
745defm : PdWriteRes<WriteFMaskedLoadY,       [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>;
746
747defm : PdWriteRes<WriteFStore,             [PdStore, PdFPU23, PdFPSTO], 2, [1,  3, 1]>;
748defm : PdWriteRes<WriteFStoreX,            [PdStore, PdFPU23, PdFPSTO], 1, [1,  3, 1]>;
749defm : PdWriteRes<WriteFStoreY,            [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>;
750
751def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23,  PdFPSTO]> {
752  let Latency = 2;
753  let ResourceCycles = [1, 3, 1];
754  let NumMicroOps = 2;
755}
756def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
757
758def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1,  PdFPSTO]> {
759  let NumMicroOps = 8;
760}
761def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;
762
763defm : PdWriteRes<WriteFStoreNT,           [PdStore, PdFPU1,  PdFPSTO], 3>;
764defm : PdWriteRes<WriteFStoreNTX,          [PdStore, PdFPU1,  PdFPSTO], 3>;
765defm : PdWriteRes<WriteFStoreNTY,          [PdStore, PdFPU1,  PdFPSTO], 3, [2, 2, 2], 4>;
766
767defm : PdWriteRes<WriteFMaskedStore32,     [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
768defm : PdWriteRes<WriteFMaskedStore64,     [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
769defm : PdWriteRes<WriteFMaskedStore32Y,    [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
770defm : PdWriteRes<WriteFMaskedStore64Y,    [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
771
772defm : PdWriteRes<WriteFMove,              [PdFPU01, PdFPFMA]>;
773defm : PdWriteRes<WriteFMoveX,             [PdFPU01, PdFPFMA], 1, [1, 2]>;
774defm : PdWriteRes<WriteFMoveY,             [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
775defm : X86WriteResUnsupported<WriteFMoveZ>;
776
777defm : PdWriteRes<WriteEMMS,               [PdFPU01, PdFPFMA], 2>;
778
779defm : PdWriteResXMMPair<WriteFAdd,         [PdFPU0, PdFPFMA],  5>;
780defm : PdWriteResXMMPair<WriteFAddX,        [PdFPU0, PdFPFMA],  5>;
781defm : PdWriteResYMMPair<WriteFAddY,        [PdFPU0, PdFPFMA],  5, [1, 2]>;
782defm : X86WriteResPairUnsupported<WriteFAddZ>;
783
784def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
785  let Latency = 5;
786  let ResourceCycles = [3, 1, 10];
787}
788def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m,  ADD_FI32m,  ADD_F32m,  ADD_F64m,
789                                      SUB_FI16m,  SUB_FI32m,  SUB_F32m,  SUB_F64m,
790                                      SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>;
791
// WriteFAdd64*: same resources and numbers as the WriteFAdd* classes
// (PdFPU0 + PdFPFMA); the Z class is registered as unsupported.
792defm : PdWriteResXMMPair<WriteFAdd64,       [PdFPU0, PdFPFMA],  5>;
793defm : PdWriteResXMMPair<WriteFAdd64X,      [PdFPU0, PdFPFMA],  5>;
794defm : PdWriteResYMMPair<WriteFAdd64Y,      [PdFPU0, PdFPFMA],  5, [1, 2]>;
795defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
796
// WriteFCmp* / WriteFCmp64*: PdFPU0 + PdFPFMA; Z classes unsupported.
797defm : PdWriteResXMMPair<WriteFCmp,         [PdFPU0, PdFPFMA],  2>;
798defm : PdWriteResXMMPair<WriteFCmpX,        [PdFPU0, PdFPFMA],  2>;
799defm : PdWriteResYMMPair<WriteFCmpY,        [PdFPU0, PdFPFMA],  2, [1, 2]>;
800defm : X86WriteResPairUnsupported<WriteFCmpZ>;
801
802defm : PdWriteResXMMPair<WriteFCmp64,       [PdFPU0, PdFPFMA],  2>;
803defm : PdWriteResXMMPair<WriteFCmp64X,      [PdFPU0, PdFPFMA],  2>;
804defm : PdWriteResYMMPair<WriteFCmp64Y,      [PdFPU0, PdFPFMA],  2, [1, 2]>;
805defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
806
// WriteFCom*: additionally lists PdEX0 as a resource.
807defm : PdWriteResXMMPair<WriteFCom,         [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
808defm : PdWriteResXMMPair<WriteFComX,        [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
809
// Write for FCOM32m / FCOM64m / FCOMP32m / FCOMP64m: PdFPU1 + PdFPFMA with a
// Latency of 6; ResourceCycles and NumMicroOps are not overridden.
810def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
811  let Latency = 6;
812}
813def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;
814
// Write for TST_F and UCOM_FPPr: PdFPU1 + PdFPFMA, with no field overrides
// (the record has no body, so every SchedWriteRes field keeps its default).
815def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
816def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
817
// WriteFMul*: PdFPU1 + PdFPFMA; the Z class is registered as unsupported.
818defm : PdWriteResXMMPair<WriteFMul,         [PdFPU1, PdFPFMA],  5>;
819defm : PdWriteResXMMPair<WriteFMulX,        [PdFPU1, PdFPFMA],  5>;
820defm : PdWriteResYMMPair<WriteFMulY,        [PdFPU1, PdFPFMA],  5, [1, 2]>;
821defm : X86WriteResPairUnsupported<WriteFMulZ>;
822
// Write for the `m`-suffixed x87 MUL opcodes listed below:
// PdLoad + PdFPU1 + PdFPFMA.
823def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> {
824  let Latency = 5;
825  let ResourceCycles = [3, 1, 10]; // One entry per resource listed above, in order.
826}
827def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>;
828
// WriteFMul64*: same resources and numbers as the WriteFMul* classes.
829defm : PdWriteResXMMPair<WriteFMul64,       [PdFPU1, PdFPFMA],  5>;
830defm : PdWriteResXMMPair<WriteFMul64X,      [PdFPU1, PdFPFMA],  5>;
831defm : PdWriteResYMMPair<WriteFMul64Y,      [PdFPU1, PdFPFMA],  5, [1, 2]>;
832defm : X86WriteResPairUnsupported<WriteFMul64Z>;
833
// WriteFMA*: uses PdFPU (not a specific PdFPU0/PdFPU1) + PdFPFMA; the Y class
// uses the same [1, 3] list as the XMM classes.
834defm : PdWriteResXMMPair<WriteFMA,          [PdFPU, PdFPFMA], 5, [1, 3]>;
835defm : PdWriteResXMMPair<WriteFMAX,         [PdFPU, PdFPFMA], 5, [1, 3]>;
836defm : PdWriteResYMMPair<WriteFMAY,         [PdFPU, PdFPFMA], 5, [1, 3]>;
837defm : X86WriteResPairUnsupported<WriteFMAZ>;
838
839
// WriteDPPD / WriteDPPS*: PdFPU1 + PdFPFMA. These pass a sixth argument,
// which the shorter entries in this file omit.
// NOTE(review): presumably an extra micro-op count for the folded-load
// form -- confirm against the multiclass definition.
840defm : PdWriteResXMMPair<WriteDPPD,         [PdFPU1, PdFPFMA], 15, [1, 10], 15, 2>;
841
842defm : PdWriteResXMMPair<WriteDPPS,         [PdFPU1, PdFPFMA], 25, [1, 14],  16, 2>;
843defm : PdWriteResYMMPair<WriteDPPSY,        [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>;
844defm : X86WriteResPairUnsupported<WriteDPPSZ>;
845
// Per-opcode override for VDPPSrri: PdFPU1 + PdFPFMA, Latency 27, 17 micro-ops.
846def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
847  let Latency = 27;
848  let ResourceCycles = [1, 14]; // One entry per resource listed above, in order.
849  let NumMicroOps = 17;
850}
851def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
852
// WriteFRcp* / WriteFRsqrt* / WriteFDiv*: all on PdFPU1 + PdFPFMA; each Z
// class is registered as unsupported.
853defm : PdWriteResXMMPair<WriteFRcp,         [PdFPU1, PdFPFMA],  5>;
854defm : PdWriteResXMMPair<WriteFRcpX,        [PdFPU1, PdFPFMA],  5>;
855defm : PdWriteResYMMPair<WriteFRcpY,        [PdFPU1, PdFPFMA],  5, [2, 1]>;
856defm : X86WriteResPairUnsupported<WriteFRcpZ>;
857
// Note that WriteFRsqrt passes an explicit [1, 2] list while WriteFRsqrtX
// leaves the list at the multiclass default.
858defm : PdWriteResXMMPair<WriteFRsqrt,       [PdFPU1, PdFPFMA],  5, [1, 2]>;
859defm : PdWriteResXMMPair<WriteFRsqrtX,      [PdFPU1, PdFPFMA],  5>;
860defm : PdWriteResYMMPair<WriteFRsqrtY,      [PdFPU1, PdFPFMA],  5, [2, 2]>;
861defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
862
// The Y class doubles both list entries of the XMM classes ([1, 9] -> [2, 18]).
863defm : PdWriteResXMMPair<WriteFDiv,         [PdFPU1, PdFPFMA], 9, [1, 9]>;
864defm : PdWriteResXMMPair<WriteFDivX,        [PdFPU1, PdFPFMA], 9, [1, 9]>;
865defm : PdWriteResYMMPair<WriteFDivY,        [PdFPU1, PdFPFMA], 9, [2, 18]>;
866defm : X86WriteResPairUnsupported<WriteFDivZ>;
867
// Write for the `m`-suffixed x87 DIV / DIVR opcodes listed below:
// PdLoad + PdFPU0 + PdFPFMA.
868def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
869  let Latency = 9;
870  let ResourceCycles = [3, 1, 18]; // One entry per resource listed above, in order.
871}
872def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m,  DIV_FI32m,
873                                      DIVR_FI16m, DIVR_FI32m,
874                                      DIV_F32m,   DIV_F64m,
875                                      DIVR_F32m,  DIVR_F64m)>;
876
// WriteFDiv64*, WriteFSqrt* and WriteFSqrt64*: all use PdFPU1 + PdFPFMA with
// the same numbers as the WriteFDiv* classes; Z classes are unsupported.
877defm : PdWriteResXMMPair<WriteFDiv64,       [PdFPU1, PdFPFMA], 9, [1, 9]>;
878defm : PdWriteResXMMPair<WriteFDiv64X,      [PdFPU1, PdFPFMA], 9, [1, 9]>;
879defm : PdWriteResYMMPair<WriteFDiv64Y,      [PdFPU1, PdFPFMA], 9, [2, 18]>;
880defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
881
882defm : PdWriteResXMMPair<WriteFSqrt,        [PdFPU1, PdFPFMA], 9, [1, 9]>;
883defm : PdWriteResXMMPair<WriteFSqrtX,       [PdFPU1, PdFPFMA], 9, [1, 9]>;
884defm : PdWriteResYMMPair<WriteFSqrtY,       [PdFPU1, PdFPFMA], 9, [2, 18]>;
885defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
886
887defm : PdWriteResXMMPair<WriteFSqrt64,      [PdFPU1, PdFPFMA], 9, [1, 9]>;
888defm : PdWriteResXMMPair<WriteFSqrt64X,     [PdFPU1, PdFPFMA], 9, [1, 9]>;
889defm : PdWriteResYMMPair<WriteFSqrt64Y,     [PdFPU1, PdFPFMA], 9, [2, 18]>;
890defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
891
// WriteFSqrt80 and WriteFSign: PdFPU1 + PdFPFMA.
892defm : PdWriteResXMMPair<WriteFSqrt80,      [PdFPU1, PdFPFMA],  1, [1, 18]>;
893defm : PdWriteResXMMPair<WriteFSign,        [PdFPU1, PdFPFMA],  1, [1, 4]>;
894
// WriteFRnd*: PdFPU1 + PdFPSTO (not PdFPFMA); Z class unsupported.
895defm : PdWriteResXMMPair<WriteFRnd,         [PdFPU1, PdFPSTO],  4, []>;
896defm : PdWriteResYMMPair<WriteFRndY,        [PdFPU1, PdFPSTO],  4, [2, 1], 2>;
897defm : X86WriteResPairUnsupported<WriteFRndZ>;
898
// Write for VFRCZPDrr / VFRCZPSrr: PdFPU1 + PdFPSTO, Latency 10, 2 micro-ops.
899def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> {
900  let Latency = 10;
901  let ResourceCycles = [2, 1]; // One entry per resource listed above, in order.
902  let NumMicroOps = 2;
903}
904def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>;
905
// Write for VFRCZSDrr / VFRCZSSrr: PdFPU1 + PdFPSTO, Latency 10, 2 micro-ops.
// Its first ResourceCycles entry is 10, versus 2 in PdWriteVFRCZP.
906def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> {
907  let Latency = 10;
908  let ResourceCycles = [10, 1]; // One entry per resource listed above, in order.
909  let NumMicroOps = 2;
910}
911def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>;
912
// Write for the `rm` forms VFRCZPDrm / VFRCZPSrm / VFRCZSDrm / VFRCZSSrm:
// PdFPU1 + PdFPSTO, Latency 15, 3 micro-ops. PdLoad is not listed as a resource.
913def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
914  let Latency = 15;
915  let ResourceCycles = [2, 1]; // One entry per resource listed above, in order.
916  let NumMicroOps = 3;
917}
918def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
919                                      VFRCZSDrm, VFRCZSSrm)>;
920
// Write for VFRCZPSYrr / VFRCZPDYrr: PdFPU1 + PdFPSTO, Latency 10, 4 micro-ops.
921def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
922  let Latency = 10;
923  let ResourceCycles = [3, 1]; // One entry per resource listed above, in order.
924  let NumMicroOps = 4;
925}
926def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
927
// Write for VFRCZPSYrm / VFRCZPDYrm: PdFPU1 + PdFPSTO, Latency 15, 8 micro-ops.
// PdLoad is not listed as a resource.
928def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
929  let Latency = 15;
930  let ResourceCycles = [4, 1]; // One entry per resource listed above, in order.
931  let NumMicroOps = 8;
932}
933def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
934
// WriteFLogic*: PdFPU01 + PdFPFMA; Z class unsupported.
935defm : PdWriteResXMMPair<WriteFLogic,       [PdFPU01, PdFPFMA],  2, [1, 2]>;
936defm : PdWriteResYMMPair<WriteFLogicY,      [PdFPU01, PdFPFMA],  2, [2, 2]>;
937defm : X86WriteResPairUnsupported<WriteFLogicZ>;
938
// WriteFTest*: also lists PdEX0. The XMM class names PdFPU0 while the Y
// class names PdFPU01.
939defm : PdWriteResXMMPair<WriteFTest,        [PdFPU0, PdFPFMA, PdEX0],  1, [], 2>;
940defm : PdWriteResYMMPair<WriteFTestY,       [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>;
941defm : X86WriteResPairUnsupported<WriteFTestZ>;
942
// WriteFShuffle*: PdFPU01 + PdFPFMA; Z class unsupported.
943defm : PdWriteResXMMPair<WriteFShuffle,     [PdFPU01, PdFPFMA],  2, [1, 2]>;
944defm : PdWriteResYMMPair<WriteFShuffleY,    [PdFPU01, PdFPFMA],  2, [2, 4], 2>;
945defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
946
// Per-opcode override for VBROADCASTF128: PdFPU01 + PdFPFMA, Latency 7,
// 2 micro-ops.
947def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
948  let Latency = 7;
949  let ResourceCycles = [1, 3]; // One entry per resource listed above, in order.
950  let NumMicroOps = 2;
951}
952def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
953
// WriteFVarShuffle* / WriteFBlend* / WriteFVarBlend*: all on PdFPU01 + PdFPFMA;
// each Z class is registered as unsupported.
954defm : PdWriteResXMMPair<WriteFVarShuffle,  [PdFPU01, PdFPFMA],  3, [1, 2]>;
955defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA],  3, [2, 4], 2>;
956defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
957
958defm : PdWriteResXMMPair<WriteFBlend,       [PdFPU01, PdFPFMA],  2, [1, 3]>;
959defm : PdWriteResYMMPair<WriteFBlendY,      [PdFPU01, PdFPFMA],  2, [2, 3], 2>;
960defm : X86WriteResPairUnsupported<WriteFBlendZ>;
961
962defm : PdWriteResXMMPair<WriteFVarBlend,    [PdFPU01, PdFPFMA],  2, [1, 3]>;
963defm : PdWriteResYMMPair<WriteFVarBlendY,   [PdFPU01, PdFPFMA],  2, [2, 4], 2>;
964defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
965
// WriteFShuffle256 is instantiated through the XMM pair multiclass, while
// WriteFVarShuffle256 is registered as unsupported.
966defm : PdWriteResXMMPair<WriteFShuffle256,  [PdFPU01, PdFPFMA],  2, [1, 3], 2>;
967defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
968
// Per-opcode override for VEXTRACTF128rr: PdFPU01 + PdFPFMA, Latency 2;
// NumMicroOps is not overridden.
969def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
970  let Latency = 2;
971  let ResourceCycles = [1, 2]; // One entry per resource listed above, in order.
972}
973def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
974
// Per-opcode override for VEXTRACTF128mr: PdFPU01 + PdFPFMA, Latency 7,
// 2 micro-ops. PdStore is not listed as a resource.
975def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
976  let Latency = 7;
977  let ResourceCycles = [1, 4]; // One entry per resource listed above, in order.
978  let NumMicroOps = 2;
979}
980def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
981
// Per-opcode override for VPERM2F128rr: PdFPU01 + PdFPFMA, Latency 4,
// 8 micro-ops.
982def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
983  let Latency = 4;
984  let ResourceCycles = [1, 6]; // One entry per resource listed above, in order.
985  let NumMicroOps = 8;
986}
987def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
988
// Per-opcode override for VPERM2F128rm: PdFPU01 + PdFPFMA, 10 micro-ops.
// The Latency of 8 is the rr latency (4) plus 4, per the inline note.
989def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
990  let Latency = 8; // 4 + 4
991  let ResourceCycles = [1, 8]; // One entry per resource listed above, in order.
992  let NumMicroOps = 10;
993}
994def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
995
996////////////////////////////////////////////////////////////////////////////////
997// Conversions.
998////////////////////////////////////////////////////////////////////////////////
999
// WriteCvtSS2I / WriteCvtSD2I list five resources
// (PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0).
1000defm : PdWriteResXMMPair<WriteCvtSS2I,   [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
1001
// The packed classes use PdFPU0 + PdFPCVT + PdFPSTO; Z classes unsupported.
1002defm : PdWriteResXMMPair<WriteCvtPS2I,   [PdFPU0, PdFPCVT, PdFPSTO], 4>;
1003defm : PdWriteResYMMPair<WriteCvtPS2IY,  [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
1004defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
1005
1006defm : PdWriteResXMMPair<WriteCvtSD2I,   [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
1007
// WriteCvtPD2IY adds PdFPFMA as a fourth resource relative to WriteCvtPD2I.
1008defm : PdWriteResXMMPair<WriteCvtPD2I,   [PdFPU0, PdFPCVT, PdFPSTO],          8, [],        2>;
1009defm : PdWriteResYMMPair<WriteCvtPD2IY,  [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
1010defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
1011
// Per-opcode override for MMX_CVTTPD2PIrr: PdFPU0 + PdFPCVT + PdFPSTO,
// Latency 6, 2 micro-ops; ResourceCycles is not overridden.
1012def PdWriteMMX_CVTTPD2PIrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
1013  let Latency = 6;
1014  let NumMicroOps = 2;
1015}
1016def : InstRW<[PdWriteMMX_CVTTPD2PIrr], (instrs MMX_CVTTPD2PIrr)>;
1017
// WriteCvtI2SS / WriteCvtI2PS* / WriteCvtI2SD: all on
// PdFPU0 + PdFPCVT + PdFPSTO. The FIXMEs below are pre-existing notes.
1018// FIXME: f+3 ST, LD+STC latency
1019defm : PdWriteResXMMPair<WriteCvtI2SS,   [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
1020// FIXME: .Folded version is one NumMicroOp *less*..
1021
1022defm : PdWriteResXMMPair<WriteCvtI2PS,   [PdFPU0, PdFPCVT, PdFPSTO], 4>;
1023defm : PdWriteResYMMPair<WriteCvtI2PSY,  [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
1024defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
1025
1026defm : PdWriteResXMMPair<WriteCvtI2SD,   [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
1027// FIXME: .Folded version is one NumMicroOp *less*..
1028
// Override for the four `rr` opcodes CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr and
// CVTSI2SSrr: PdFPU0 + PdFPCVT + PdFPSTO, Latency 13, 2 micro-ops.
// (The record name spells the third opcode "CVTSI2SDr"; the InstRW list below
// is what actually selects the instructions.)
1029def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
1030  let Latency = 13;
1031  let ResourceCycles = [1, 3, 1]; // One entry per resource listed above, in order.
1032  let NumMicroOps = 2;
1033}
1034def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>;
1035
// Remaining int<->fp and fp<->fp conversion classes. All use
// PdFPU0 + PdFPCVT + PdFPSTO; WriteCvtPD2PSY additionally lists PdFPFMA.
// Every Z class is registered as unsupported.
1036defm : PdWriteResXMMPair<WriteCvtI2PD,   [PdFPU0, PdFPCVT, PdFPSTO], 8, [],     2>;
1037defm : PdWriteResYMMPair<WriteCvtI2PDY,  [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
1038defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
1039
1040defm : PdWriteResXMMPair<WriteCvtSS2SD,  [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
1041
1042defm : PdWriteResXMMPair<WriteCvtPS2PD,  [PdFPU0, PdFPCVT, PdFPSTO], 8, [],     2>;
1043defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
1044defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
1045
1046defm : PdWriteResXMMPair<WriteCvtSD2SS,  [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
1047
1048defm : PdWriteResXMMPair<WriteCvtPD2PS,  [PdFPU0, PdFPCVT, PdFPSTO],          8, [],        2>;
1049defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
1050defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
1051
// Shared override for MMX_CVTPD2PIrr and MMX_CVTPI2PDrr:
// PdFPU0 + PdFPCVT + PdFPSTO, Latency 6, 2 micro-ops.
1052def PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
1053  let Latency = 6;
1054  let NumMicroOps = 2;
1055}
1056def : InstRW<[PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr], (instrs MMX_CVTPD2PIrr,
1057                                                            MMX_CVTPI2PDrr)>;
1058
// Per-opcode override for MMX_CVTPI2PSrr: PdFPU0 + PdFPCVT + PdFPSTO,
// Latency 4, 2 micro-ops.
1059def PdWriteMMX_CVTPI2PSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
1060  let Latency = 4;
1061  let NumMicroOps = 2;
1062}
1063def : InstRW<[PdWriteMMX_CVTPI2PSrr], (instrs MMX_CVTPI2PSrr)>;
1064
// WriteCvtPH2PS* / WriteCvtPS2PH* conversion classes; Z classes unsupported.
1065defm : PdWriteResXMMPair<WriteCvtPH2PS,  [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>;
1066defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>;
1067defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
1068
// The PS2PH classes use the non-pair PdWriteRes multiclass; the Y class adds
// PdFPFMA to the resource list.
1069defm : PdWriteRes<WriteCvtPS2PH,        [PdFPU0, PdFPCVT, PdFPSTO],          8, [1, 2, 1],    2>;
1070defm : PdWriteRes<WriteCvtPS2PHY,       [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
1071defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
1072
// The *St classes append PdStore as the last resource.
1073defm : PdWriteRes<WriteCvtPS2PHSt,      [PdFPU0, PdFPCVT, PdFPSTO, PdStore],          4, [1, 2, 1, 1],    3>;
1074defm : PdWriteRes<WriteCvtPS2PHYSt,     [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>;
1075defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
1076
1077////////////////////////////////////////////////////////////////////////////////
1078// Vector integer operations.
1079////////////////////////////////////////////////////////////////////////////////
1080
// WriteVecLoad* / WriteVecLoadNT* / WriteVecMaskedLoad*:
// PdLoad + PdFPU01 + PdFPMAL.
1081defm : PdWriteRes<WriteVecLoad,             [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
1082defm : PdWriteRes<WriteVecLoadX,            [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
1083defm : PdWriteRes<WriteVecLoadY,            [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>;
1084
1085defm : PdWriteRes<WriteVecLoadNT,           [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>;
1086defm : PdWriteRes<WriteVecLoadNTY,          [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>;
1087
1088defm : PdWriteRes<WriteVecMaskedLoad,       [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>;
1089defm : PdWriteRes<WriteVecMaskedLoadY,      [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>;
1090
// WriteVecStore*: PdStore + PdFPU23 + PdFPSTO. The Y class has 36 as its
// second list entry, versus 3 for the other two.
1091defm : PdWriteRes<WriteVecStore,            [PdStore, PdFPU23, PdFPSTO], 2, [1, 3,  1]>;
1092defm : PdWriteRes<WriteVecStoreX,           [PdStore, PdFPU23, PdFPSTO], 1, [1, 3,  1]>;
1093defm : PdWriteRes<WriteVecStoreY,           [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>;
1094
// Per-opcode override for VMOVDQUYmr: PdStore + PdFPU1 + PdFPSTO, modeled as
// 8 micro-ops; Latency and ResourceCycles are not overridden.
1095def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1,   PdFPSTO]> {
1096  let NumMicroOps = 8;
1097}
1098def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
1099
// WriteVecStoreNT*: PdStore + PdFPU1 + PdFPSTO.
1100defm : PdWriteRes<WriteVecStoreNT,          [PdStore, PdFPU1,   PdFPSTO], 2>;
1101defm : PdWriteRes<WriteVecStoreNTY,         [PdStore, PdFPU1,   PdFPSTO], 2, [2, 2, 2], 4>;
1102
// All four WriteVecMaskedStore* classes are registered as unsupported.
1103defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
1104defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
1105defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
1106defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
1107
// WriteVecMove*: PdFPU01 + PdFPMAL; Z class unsupported.
1108defm : PdWriteRes<WriteVecMove,             [PdFPU01, PdFPMAL], 2>;
1109defm : PdWriteRes<WriteVecMoveX,            [PdFPU01, PdFPMAL], 1, [1, 2]>;
1110defm : PdWriteRes<WriteVecMoveY,            [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
1111defm : X86WriteResUnsupported<WriteVecMoveZ>;
1112
// Per-opcode override for MOVDQArr: PdFPU01 + PdFPMAL. The body is empty, so
// every SchedWriteRes field keeps its default.
1113def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
1114}
1115def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>;
1116
// Per-opcode override for MMX_MOVQ2DQrr: PdFPU01 + PdFPMAL with Latency 4.
1117def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
1118  let Latency = 4;
1119}
1120def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>;
1121
// WriteVecMoveToGpr / WriteVecMoveFromGpr: both use the PdFPFMA pipe; the
// ToGpr class additionally lists PdEX0.
1122defm : PdWriteRes<WriteVecMoveToGpr,        [PdFPU0, PdFPFMA, PdEX0], 11>;
1123defm : PdWriteRes<WriteVecMoveFromGpr,      [PdFPU01, PdFPFMA], 11, [1, 2], 2>;
1124
// For the ALU, shift, shift-imm and integer-multiply classes below, only the
// base and X classes are modeled; the Y and Z classes are registered as
// unsupported.
1125defm : PdWriteResXMMPair<WriteVecALU,        [PdFPU01, PdFPMAL], 2>;
1126defm : PdWriteResXMMPair<WriteVecALUX,       [PdFPU01, PdFPMAL], 2, [1, 2]>;
1127defm : X86WriteResPairUnsupported<WriteVecALUY>;
1128defm : X86WriteResPairUnsupported<WriteVecALUZ>;
1129
1130defm : PdWriteResXMMPair<WriteVecShift,      [PdFPU01, PdFPMAL], 3, [1, 2]>;
1131defm : PdWriteResXMMPair<WriteVecShiftX,     [PdFPU01, PdFPMAL], 3, [1, 2]>;
1132defm : X86WriteResPairUnsupported<WriteVecShiftY>;
1133defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
1134
1135defm : PdWriteResXMMPair<WriteVecShiftImm,   [PdFPU01, PdFPMAL], 2, [1, 2]>;
1136defm : PdWriteResXMMPair<WriteVecShiftImmX,  [PdFPU01, PdFPMAL], 2, [1, 2]>;
1137defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
1138defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
1139
// WriteVecIMul*: PdFPU0 + PdFPMMA.
1140defm : PdWriteResXMMPair<WriteVecIMul,       [PdFPU0, PdFPMMA], 4>;
1141defm : PdWriteResXMMPair<WriteVecIMulX,      [PdFPU0, PdFPMMA], 4>;
1142defm : X86WriteResPairUnsupported<WriteVecIMulY>;
1143defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
1144
// WritePMULLD lists four resources: PdFPU0, PdFPU01, PdFPMMA and PdFPMAL.
1145defm : PdWriteResXMMPair<WritePMULLD,        [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL], 5, [2, 1, 2, 1]>;
1146defm : X86WriteResPairUnsupported<WritePMULLDY>;
1147defm : X86WriteResPairUnsupported<WritePMULLDZ>;
1148
// Override for VPMACSDQHrr / VPMACSDQLrr / VPMACSSDQHrr / VPMACSSDQLrr:
// PdFPU0 + PdFPMMA + PdFPMAL with Latency 4.
1149def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> {
1150  let Latency = 4;
1151}
1152def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
1153                                      VPMACSSDQLrr)>;
1154
// WriteMPSAD: PdFPU0 + PdFPMMA; the Y and Z classes are unsupported.
1155defm : PdWriteResXMMPair<WriteMPSAD,         [PdFPU0, PdFPMMA], 9, [1, 4], 8>;
1156defm : X86WriteResPairUnsupported<WriteMPSADY>;
1157defm : X86WriteResPairUnsupported<WriteMPSADZ>;
1158
// Per-opcode override for VMPSADBWrri: PdFPU0 + PdFPMMA, Latency 8,
// 10 micro-ops.
1159def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> {
1160  let Latency = 8;
1161  let ResourceCycles = [1, 4]; // One entry per resource listed above, in order.
1162  let NumMicroOps = 10;
1163}
1164def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>;
1165
// WritePSADBW*: PdFPU01 + PdFPMAL; Y and Z classes unsupported.
1166defm : PdWriteResXMMPair<WritePSADBW,        [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
1167defm : PdWriteResXMMPair<WritePSADBWX,       [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
1168defm : X86WriteResPairUnsupported<WritePSADBWY>;
1169defm : X86WriteResPairUnsupported<WritePSADBWZ>;
1170
// WritePHMINPOS names PdFPU0, unlike the PdFPU01 used by the other classes
// in this group.
1171defm : PdWriteResXMMPair<WritePHMINPOS,      [PdFPU0,  PdFPMAL], 4, [], 2>;
1172
// WriteShuffle*: the Y class is modeled here, with the YMM pair multiclass.
1173defm : PdWriteResXMMPair<WriteShuffle,       [PdFPU01, PdFPMAL], 2, [1, 2]>;
1174defm : PdWriteResXMMPair<WriteShuffleX,      [PdFPU01, PdFPMAL], 2, [1, 2]>;
1175defm : PdWriteResYMMPair<WriteShuffleY,      [PdFPU01, PdFPMAL], 2, [1, 4]>;
1176defm : X86WriteResPairUnsupported<WriteShuffleZ>;
1177
// WriteVarShuffle*: Y and Z classes unsupported.
1178defm : PdWriteResXMMPair<WriteVarShuffle,    [PdFPU01, PdFPMAL], 3, [1, 2]>;
1179defm : PdWriteResXMMPair<WriteVarShuffleX,   [PdFPU01, PdFPMAL], 3, [1, 3]>;
1180defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
1181defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
1182
// Override for VPPERMrrr and VPPERMrrr_REV: PdFPU01 + PdFPMAL, Latency 2.
1183def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> {
1184  let Latency = 2;
1185  let ResourceCycles = [1, 3]; // One entry per resource listed above, in order.
1186}
1187def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>;
1188
// Integer blend / logic classes: PdFPU01 + PdFPMAL; the Y and Z classes are
// registered as unsupported.
1189defm : PdWriteResXMMPair<WriteBlend,         [PdFPU01, PdFPMAL], 2>;
1190defm : X86WriteResPairUnsupported<WriteBlendY>;
1191defm : X86WriteResPairUnsupported<WriteBlendZ>;
1192
1193defm : PdWriteResXMMPair<WriteVarBlend,      [PdFPU01, PdFPMAL], 2, [1, 2]>;
1194defm : X86WriteResPairUnsupported<WriteVarBlendY>;
1195defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
1196
1197defm : PdWriteResXMMPair<WriteVecLogic,      [PdFPU01, PdFPMAL], 2>;
1198defm : PdWriteResXMMPair<WriteVecLogicX,     [PdFPU01, PdFPMAL], 2, [1, 2]>;
1199defm : X86WriteResPairUnsupported<WriteVecLogicY>;
1200defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
1201
// WriteVecTest*: uses PdFPFMA + PdEX0 rather than PdFPMAL; the Y class is
// modeled and only the Z class is unsupported.
1202defm : PdWriteResXMMPair<WriteVecTest,       [PdFPU0, PdFPFMA, PdEX0],  1, [], 2>;
1203defm : PdWriteResYMMPair<WriteVecTestY,      [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>;
1204defm : X86WriteResPairUnsupported<WriteVecTestZ>;
1205
// These three *256 classes are instantiated with only the resource list, so
// every other argument takes the multiclass default.
1206defm : PdWriteResXMMPair<WriteShuffle256,    [PdFPU01, PdFPMAL]>;
1207defm : PdWriteResXMMPair<WriteVPMOV256,      [PdFPU01, PdFPMAL]>;
1208defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
1209
1210defm : PdWriteResXMMPair<WriteVarVecShift,   [PdFPU01, PdFPMAL], 3, [1, 2]>;
1211defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
1212defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
1213
1214////////////////////////////////////////////////////////////////////////////////
1215// Vector insert/extract operations.
1216////////////////////////////////////////////////////////////////////////////////
1217
// Insert classes use PdFPU01 + PdFPMAL; the Ld class appends PdLoad.
1218defm : PdWriteRes<WriteVecInsert,    [PdFPU01, PdFPMAL], 2, [1, 3], 2>;
1219defm : PdWriteRes<WriteVecInsertLd,  [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>;
1220
// Extract uses PdFPU0 + PdFPFMA + PdEX0; the St class uses
// PdFPU1 + PdFPSTO + PdStore instead.
1221defm : PdWriteRes<WriteVecExtract,   [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>;
1222defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>;
1223
// Override for EXTRQ and EXTRQI: PdFPU01 + PdFPMAL, Latency 3.
1224def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
1225  let Latency = 3;
1226  let ResourceCycles = [1, 3]; // One entry per resource listed above, in order.
1227}
1228def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
1229
1230////////////////////////////////////////////////////////////////////////////////
1231// SSE42 String instructions.
1232////////////////////////////////////////////////////////////////////////////////
1233
// WritePCmpIStr*: PdFPU1 + PdFPFMA + PdEX0.
1234defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>;
1235defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0],  7, [1, 8, 1], 7, 2>;
1236
// WritePCmpEStr*: six resources, including both PdStore and PdLoad; the two
// classes differ only in the value after the resource list (14 vs 10).
1237defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>;
1238defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>;
1239
1240////////////////////////////////////////////////////////////////////////////////
1241// MOVMSK Instructions.
1242////////////////////////////////////////////////////////////////////////////////
1243
// MOVMSK classes: all on PdFPU0 + PdFPFMA + PdEX0. WriteVecMOVMSKY is
// registered as unsupported; the WriteVecMOVMSKZ entry is left commented out.
1244defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0],   12, [], 2>;
1245
1246defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
1247defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
1248// defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
1249
1250defm : PdWriteRes<WriteMMXMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
1251
1252////////////////////////////////////////////////////////////////////////////////
1253// AES Instructions.
1254////////////////////////////////////////////////////////////////////////////////
1255
// AES classes: all on PdFPU0 + PdFPMMA.
1256defm : PdWriteResXMMPair<WriteAESIMC,    [PdFPU0, PdFPMMA], 5>;
1257defm : PdWriteResXMMPair<WriteAESKeyGen, [PdFPU0, PdFPMMA], 5>;
1258defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
1259
1260////////////////////////////////////////////////////////////////////////////////
1261// Horizontal add/sub  instructions.
1262////////////////////////////////////////////////////////////////////////////////
1263
// WriteFHAdd*: PdFPU0 + PdFPFMA; Z class unsupported.
1264defm : PdWriteResXMMPair<WriteFHAdd,  [PdFPU0, PdFPFMA], 11, [1, 5],     3, 1>;
1265defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>;
1266defm : X86WriteResPairUnsupported<WriteFHAddZ>;
1267
// WritePHAdd*: PdFPU01 + PdFPMAL; Y and Z classes unsupported.
1268defm : PdWriteResXMMPair<WritePHAdd,  [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>;
1269defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
1270defm : X86WriteResPairUnsupported<WritePHAddY>;
1271defm : X86WriteResPairUnsupported<WritePHAddZ>;
1272
// Bind the listed `rr` opcodes explicitly to WritePHAdd ...
1273def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
1274                                   PHADDWrr, PHSUBWrr,
1275                                   PHADDSWrr, PHSUBSWrr,
1276                                   VPHADDDrr, VPHSUBDrr,
1277                                   VPHADDWrr, VPHSUBWrr,
1278                                   VPHADDSWrr, VPHSUBSWrr)>;
1279
// ... and the matching `rm` opcodes to its .Folded counterpart.
1280def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
1281                                          PHADDWrm, PHSUBWrm,
1282                                          PHADDSWrm, PHSUBSWrm,
1283                                          VPHADDDrm, VPHSUBDrm,
1284                                          VPHADDWrm, VPHSUBWrm,
1285                                          VPHADDSWrm, VPHSUBSWrm)>;
1286
1287////////////////////////////////////////////////////////////////////////////////
1288// Carry-less multiplication instructions.
1289////////////////////////////////////////////////////////////////////////////////
1290
// WriteCLMul: PdFPU0 + PdFPMMA.
1291defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
1292
// Per-opcode override for VPCLMULQDQrr: PdFPU0 + PdFPMMA, Latency 12,
// 6 micro-ops.
1293def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
1294  let Latency = 12;
1295  let ResourceCycles = [1, 7]; // One entry per resource listed above, in order.
1296  let NumMicroOps = 6;
1297}
1298def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
1299
1300////////////////////////////////////////////////////////////////////////////////
1301// SSE4A instructions.
1302////////////////////////////////////////////////////////////////////////////////
1303
// Override for INSERTQ: PdFPU01 + PdFPMAL, Latency 3.
1304def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
1305  let Latency = 3;
1306  let ResourceCycles = [1, 2]; // One entry per resource listed above, in order.
1307}
1308def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>;
1309
// Override for INSERTQI: PdFPU01 + PdFPMAL, Latency 3. Its second
// ResourceCycles entry is 3, versus 2 for PdWriteINSERTQ.
1310def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> {
1311  let Latency = 3;
1312  let ResourceCycles = [1, 3]; // One entry per resource listed above, in order.
1313}
1314def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>;
1315
1316////////////////////////////////////////////////////////////////////////////////
1317// AVX instructions.
1318////////////////////////////////////////////////////////////////////////////////
1319
// Override for VBROADCASTSDYrm / VBROADCASTSSYrm: PdLoad + PdFPU01 + PdFPFMA,
// Latency 6, 2 micro-ops. The InstRW also attaches ReadAfterLd.
1320def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
1321  let Latency = 6;
1322  let ResourceCycles = [1, 2, 4]; // One entry per resource listed above, in order.
1323  let NumMicroOps = 2;
1324}
1325def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
1326                                                          VBROADCASTSSYrm)>;
1327
// Override for VZEROALL: no processor resources are listed (empty list);
// only Latency (90) and NumMicroOps (32) are modeled.
1328def PdWriteVZEROALL : SchedWriteRes<[]> {
1329  let Latency = 90;
1330  let NumMicroOps = 32;
1331}
1332def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;
1333
// Override for VZEROUPPER: no processor resources are listed (empty list);
// only Latency (46) and NumMicroOps (16) are modeled.
1334def PdWriteVZEROUPPER : SchedWriteRes<[]> {
1335  let Latency = 46;
1336  let NumMicroOps = 16;
1337}
1338def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;
1339
1340///////////////////////////////////////////////////////////////////////////////
1341//  SchedWriteVariant definitions.
1342///////////////////////////////////////////////////////////////////////////////
1343
// Resource-less write with Latency 0. It is the write selected by the
// SchedWriteVariant records in this section when ZeroIdiomPredicate holds.
1344def PdWriteZeroLatency : SchedWriteRes<[]> {
1345  let Latency = 0;
1346}
1347
// Variant write for the GPR SUB/XOR `rr` opcodes listed below: resolves to
// PdWriteZeroLatency when ZeroIdiomPredicate matches, and otherwise falls
// through to the TruePred entry, WriteALU.
1348def PdWriteZeroIdiom : SchedWriteVariant<[
1349  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
1350  SchedVar<MCSchedPredicate<TruePred>,           [WriteALU]>
1351]>;
1352def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
1353                                         XOR32rr, XOR64rr)>;
1354
// Variant write for the XORPS/XORPD/ANDNPS/ANDNPD `rr` opcodes and their
// V-prefixed forms: PdWriteZeroLatency when ZeroIdiomPredicate matches,
// otherwise WriteFLogic. No Y-suffixed opcode is listed here.
1355def PdWriteFZeroIdiom : SchedWriteVariant<[
1356  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
1357  SchedVar<MCSchedPredicate<TruePred>,           [WriteFLogic]>
1358]>;
1359def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr,  VXORPSrr,
1360                                          XORPDrr,  VXORPDrr,
1361                                          ANDNPSrr, VANDNPSrr,
1362                                          ANDNPDrr, VANDNPDrr)>;
1363
1364// The ymm forms VXORPSYrr, VXORPDYrr, VANDNPSYrr and VANDNPDYrr get no zero-latency variant here: their "zero-idioms" have a latency of 1.
1365
// Variant write for MMX_PXORrr / MMX_PANDNrr: PdWriteZeroLatency when
// ZeroIdiomPredicate matches, otherwise WriteVecLogic.
1366def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
1367  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
1368  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogic]>
1369]>;
1370def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;
1371
// Variant write for PXORrr / PANDNrr and their V-prefixed forms:
// PdWriteZeroLatency when ZeroIdiomPredicate matches, otherwise WriteVecLogicX.
1372def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
1373  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
1374  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogicX]>
1375]>;
1376def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr,  VPXORrr,
1377                                                PANDNrr, VPANDNrr)>;
1378
// Variant write for the MMX PSUB{B,D,Q,W} and PCMPGT{B,D,W} `rr` opcodes:
// PdWriteZeroLatency when ZeroIdiomPredicate matches, otherwise WriteVecALU.
1379def PdWriteVZeroIdiomALU : SchedWriteVariant<[
1380  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
1381  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALU]>
1382]>;
1383def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBrr,   MMX_PSUBDrr,
1384                                             MMX_PSUBQrr,   MMX_PSUBWrr,
1385                                             MMX_PCMPGTBrr,
1386                                             MMX_PCMPGTDrr,
1387                                             MMX_PCMPGTWrr)>;
1388
// Variant write for the PSUB{B,D,Q,W} and PCMPGT{B,D,W} `rr` opcodes and
// their V-prefixed forms: PdWriteZeroLatency when ZeroIdiomPredicate matches,
// otherwise WriteVecALUX. Neither PCMPGTQrr nor VPCMPGTQrr is listed here.
1389def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
1390    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
1391    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALUX]>
1392]>;
1393def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
1394                                              PSUBDrr, VPSUBDrr,
1395                                              PSUBQrr, VPSUBQrr,
1396                                              PSUBWrr, VPSUBWrr,
1397                                              PCMPGTBrr, VPCMPGTBrr,
1398                                              PCMPGTDrr, VPCMPGTDrr,
1399                                              PCMPGTWrr, VPCMPGTWrr)>;
1400
1401///////////////////////////////////////////////////////////////////////////////
1402// Dependency breaking instructions.
1403///////////////////////////////////////////////////////////////////////////////
1404
1405// Note: VPCMPGTQrr is listed as a zero-idiom below, but PCMPGTQrr is not!
1406
// Declares the opcodes this model treats as zero idioms. Each DepBreakingClass
// pairs an opcode list with the predicate that must hold; every class here
// uses ZeroIdiomPredicate.
1407def : IsZeroIdiomFunction<[
1408  // GPR Zero-idioms.
1409  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
1410
1411  // MMX Zero-idioms.
1412  DepBreakingClass<[
1413    MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
1414    MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
1415    MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
1416    MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
1417  ], ZeroIdiomPredicate>,
1418
1419  // SSE Zero-idioms.
1420  DepBreakingClass<[
1421    // fp variants.
1422    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
1423
1424    // int variants.
1425    PXORrr, PANDNrr,
1426    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
1427    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
1428    PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
1429  ], ZeroIdiomPredicate>,
1430
1431  // AVX Zero-idioms.
1432  DepBreakingClass<[
1433    // xmm fp variants.
1434    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
1435
1436    // xmm int variants.
1437    VPXORrr, VPANDNrr,
1438    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
1439    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
1440    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, // Q form appears only in this AVX list.
1441
1442    // ymm variants.
1443    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1444  ], ZeroIdiomPredicate>
1445]>;
1446
// Declares the opcodes this model treats as dependency-breaking. Each
// DepBreakingClass pairs an opcode list with the predicate that must hold.
1447def : IsDepBreakingFunction<[
1448  // GPR
1449  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
// NOTE(review): CMP uses CheckSameRegOperand<0, 1> rather than
// ZeroIdiomPredicate; presumably it compares operands 0 and 1 -- confirm.
1450  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
1451
1452  // MMX
1453  DepBreakingClass<[
1454    MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
1455  ], ZeroIdiomPredicate>,
1456
1457  // SSE
1458  DepBreakingClass<[
1459    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
1460    // But not PCMPEQQrr.
1461  ], ZeroIdiomPredicate>,
1462
1463  // AVX
1464  DepBreakingClass<[
1465    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
1466    // But not VPCMPEQQrr.
1467  ], ZeroIdiomPredicate>
1468]>;
1469
1470
1471} // SchedModel
1472