xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedCyclone.td (revision a8197ad3aa952a03fc2aeebc2eafe9bb9de54550)
1//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AArch64 Cyclone to support
10// instruction scheduling and other instruction cost heuristics.
11//
12//===----------------------------------------------------------------------===//
13
// Top-level machine model for Cyclone. Each field carries its own rationale.
def CycloneModel : SchedMachineModel {
  // Dispatch bandwidth: 6 micro-ops per cycle.
  let IssueWidth = 6;
  // Sized after the reorder buffer.
  let MicroOpBufferSize = 192;
  // Optimistic load latency.
  let LoadLatency = 4;
  // Typical mispredict cost is 14-19 cycles.
  let MispredictPenalty = 16;
  let CompleteModel = 1;

  list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
23
24//===----------------------------------------------------------------------===//
25// Define each kind of processor resource and number available on Cyclone.
26
// Integer side: 4 pipes.
let BufferSize = 48 in
def CyUnitI : ProcResource<4>;

// Branch units (2): I[0..1]
let Super = CyUnitI, BufferSize = 24 in
def CyUnitB : ProcResource<2>;

// Indirect-branch unit (1): I[0]
let Super = CyUnitB in
def CyUnitBR : ProcResource<1>;

// Shifter pipes (2): I[2..3]
// Consuming a CyUnitIS also consumes a CyUnitI.
let Super = CyUnitI, BufferSize = 24 in
def CyUnitIS : ProcResource<2>;

// Multiply pipe (1): I[0]
let Super = CyUnitBR, BufferSize = 32 in
def CyUnitIM : ProcResource<1>;

// Divide pipe (1): I[1]
let Super = CyUnitB, BufferSize = 16 in
def CyUnitID : ProcResource<1>;

// Integer division unit (1). The ID pipe drives it, but the pipe itself is
// only held for one cycle at issue and one more cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;
65
// Load/store pipes (2).
let BufferSize = 28 in
def CyUnitLS : ProcResource<2>;

// FP/vector pipes (3).
let BufferSize = 48 in
def CyUnitV : ProcResource<3>;

// FP/vector arithmetic and multiply pipes (2): V[0-1]
let Super = CyUnitV, BufferSize = 32 in
def CyUnitVM : ProcResource<2>;

// FP/vector division/sqrt pipe (1): V[2]
let Super = CyUnitV, BufferSize = 16 in
def CyUnitVD : ProcResource<1>;

// FP compare pipe (1): V[0]
let Super = CyUnitVM, BufferSize = 16 in
def CyUnitVC : ProcResource<1>;

// FP division/square-root units (2). The VD pipe drives them, but the pipe
// itself is only held for one cycle at issue and one cycle at writeback.
def CyUnitFloatDiv : ProcResource<2>;
94
95//===----------------------------------------------------------------------===//
96// Define scheduler read/write resources and latency on Cyclone.
97// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
98
99let SchedModel = CycloneModel in {
100
101//---
102// 7.8.1. Moves
103//---
104
// uX: a lone nop micro-op that uses no resources and has zero latency.
def WriteX : SchedWriteRes<[]> {
  let Latency = 0;
}

// Zeroing a GPR is a register rename (to machine register zero), so the
// move collapses to a single nop micro-op. Covers:
//   MOVZ Rd, #0
//   AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[
  SchedVar<WriteZPred,  [WriteX]>,
  SchedVar<NoSchedPred, [WriteImm]>
]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi, MOVZXi, ANDWri, ANDXri)>;
117
// A GPR-to-GPR move is a register rename plus a single nop micro-op:
//   ORR Xd, XZR, Xm
//   ADD Xd, Xn, #0
// The same variant also covers FPR copies; anything else is a WriteI.
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
def WriteMov : SchedWriteVariant<[
  SchedVar<WriteIMovPred, [WriteX]>,
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteI]>
]>;
def : InstRW<[WriteMov], (instrs COPY, ORRXrr, ADDXrr)>;

// Moving a non-zero immediate is an integer ALU op.
//   MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;
132
133//---
134// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
135//              Shifts and Bitfield Operations
136//---
137
// Plain integer ALU ops issue on any integer pipe.
//   ADR,ADRP
//   ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
//   ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
//   ADC(S),SBC(S)
//   Aliases: CMN, CMP, TST
//
// Conditional operations:
//   CCMNi,CCMPi,CCMNr,CCMPr,
//   CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations:
//   CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// ALU ops with a shifted or extended register operand are a single micro-op
// that occupies a shift pipeline for two cycles.
//   Shifted:  ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
//             EXAMPLE: ADDrs Xn, Xm LSL #imm
//   Extended: ADD(S)re,SUB(S)re
//             EXAMPLE: ADDXre Xn, Xm, UXTB #1
let Latency = 2, ResourceCycles = [2] in {
  def : WriteRes<WriteISReg, [CyUnitIS]>;
  def : WriteRes<WriteIEReg, [CyUnitIS]>;
}

// Variable shift and bitfield operations.
//   ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;
172
// EXTR shifts a register pair and needs two micro-ops, each on a shift pipe.
// The second micro-op is delayed; ReadExtrHi models that delay.
//   EXTR Xn, Xm, #imm
let Latency = 2, NumMicroOps = 2 in
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]>;

// The first register read of EXTR happens one cycle late, which effectively
// shortens the latency of whatever wrote that register.
//   EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;
185
186//---
187// 7.8.6. Multiplies
188//---
189
// 32-bit multiplies on the single multiply pipe (MUL/MNEG alias MADD/MSUB).
//   MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
let Latency = 4 in
def : WriteRes<WriteIM32, [CyUnitIM]>;

// 64-bit multiplies take one cycle longer.
//   MADDX,MSUBX,SMULH,UMULH
let Latency = 5 in
def : WriteRes<WriteIM64, [CyUnitIM]>;
199
200//---
201// 7.8.7. Divide
202//---
203
// 32-bit divide: 7-13 cycles in practice; 10 cycles covers a 20-bit quotient.
// The ID pipe is held for 2 cycles (issue and writeback), while the divider
// itself is busy for the whole latency.
//   SDIVW,UDIVW
let Latency = 10, ResourceCycles = [2, 10] in
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]>;

// 64-bit divide: 7-21 cycles in practice; 13 cycles covers a 32-bit quotient.
// Same pipe/divider split as the 32-bit form.
//   SDIVX,UDIVX
let Latency = 13, ResourceCycles = [2, 13] in
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]>;
218
219//---
220// 7.8.8,7.8.10. Load/Store, single element
221//---
222
// Integer loads: 4-cycle latency, one LS unit for one cycle.
let Latency = 4 in
def : WriteRes<WriteLD, [CyUnitLS]>;

// Stores: the 4 cycles here are the store-load forwarding latency.
//
// Note: the store-exclusive sequence incorporates this latency. General
// heuristics, however, should not model a dependence between a store and a
// later may-alias load, because hardware speculation works.
let Latency = 4 in
def : WriteRes<WriteST, [CyUnitLS]>;
237
// Load from base address plus an optionally scaled register offset.
// With a scaled index, Rt latency is WriteIS + WriteLD.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  // Load from scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,
  // Load from register offset.
  SchedVar<NoSchedPred, [WriteLD]>
]>;
// Map the generic AArch64 type onto the Cyclone variant.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;

// Store counterpart of the above.
//   EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  // Store to scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>,
  // Store to register offset.
  SchedVar<NoSchedPred, [WriteST]>
]>;
// Map the generic AArch64 type onto the Cyclone variant.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;

// The (unshifted) base register Xn is read by the second micro-op, one cycle
// later, when the offset has to be shifted first.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  // Read base reg after shifting offset.
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,
  // Read base reg with no shift.
  SchedVar<NoSchedPred, [ReadDefault]>
]>;
// Map the generic AArch64 type onto the Cyclone variant.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>;
259
260//---
261// 7.8.9,7.8.11. Load/Store, paired
262//---
263
// Pre/post-increment of the address is a simple ALU op with one cycle latency.
def : WriteRes<WriteAdr, [CyUnitI]>;

// The high-register write of LDP is fused with the load, but a nop micro-op
// remains; it uses no resources.
let Latency = 4 in
def : WriteRes<WriteLDHi, []>;

// STP is a vector op plus a store. The QQ form is the exception: it is just
// two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
275
276//---
277// 7.8.13. Branches
278//---
279
// A branch is a single micro-op with zero latency.
// The misprediction penalty lives in the SchedMachineModel, not here.
let Latency = 0 in {
  def : WriteRes<WriteBr,    [CyUnitB]>;
  def : WriteRes<WriteBrReg, [CyUnitBR]>;
}

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
let Latency = 0 in
def : WriteRes<WriteHint, []>;

// ISB
def : InstRW<[WriteI], (instrs ISB)>;

// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions are given an invalid latency on purpose: the latency
// of other operations across them is meaningless.
let Latency = -1 in
def : WriteRes<WriteSys, []>;
299
300//===----------------------------------------------------------------------===//
301// 7.9 Vector Unit Instructions
302
// Simple vector operations: 2 cycles on any vector pipe.
let Latency = 2 in
def : WriteRes<WriteV, [CyUnitV]>;

// Cyclone-specific write types for vector ops with longer latency.
let Latency = 3 in def CyWriteV3 : SchedWriteRes<[CyUnitV]>;
let Latency = 4 in def CyWriteV4 : SchedWriteRes<[CyUnitV]>;
let Latency = 5 in def CyWriteV5 : SchedWriteRes<[CyUnitV]>;
let Latency = 6 in def CyWriteV6 : SchedWriteRes<[CyUnitV]>;

// Simple floating-point operations: 2 cycles on any vector pipe.
let Latency = 2 in
def : WriteRes<WriteF, [CyUnitV]>;
314
315//---
316// 7.9.1 Vector Moves
317//---
318
// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
//   FMOVDi Dd, #0.0
//   FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
let Latency = 2 in
def : WriteRes<WriteFImm, [CyUnitV]>;

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// An FPR-to-FPR move is a register rename plus a single nop micro-op.
//   ORR.16b Vd,Vn,Vn
// COPY is already handled by the WriteMov variant.
def WriteVMov : SchedWriteVariant<[
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteV]>
]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
337
338// FMOVSr,FMOVDr are WriteF.
339
340// MOV V,V is a WriteV.
341
342// CPY D,V[x] is a WriteV
343
344// INS V[x],V[y] is a WriteV.
345
// FMOVWSr,FMOVXDr,FMOVXDHighr
let Latency = 5 in
def : WriteRes<WriteFCopy, [CyUnitLS]>;

// FMOVSWr,FMOVDXr
def : InstRW<[WriteLD], (instrs FMOVSWr, FMOVDXr, FMOVDXHighr)>;

// INS V[x],R
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv", "UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
364
365// DUP V,V[x] is a WriteV.
366
367//---
368// 7.9.2 Integer Arithmetic, Logical, and Comparisons
369//---
370
// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv", "SQNEGv")>;

// ADDP is a WriteV.
// The pairwise long adds get their own write type so that accumulating
// consumers can name it in a read advance.
let Latency = 2 in
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]>;
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv", "UADDLPv")>;

// Across-lanes reductions.
def : InstRW<
  [CyWriteV3],
  (instregex "ADDVv", "SMAXVv", "UMAXVv", "SMINVv", "UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV", "UADDLV")>;
387
388// ADD,SUB are WriteV
389
// Declared ahead of its InstRW so the read advance below can name it.
let Latency = 3 in
def CyWriteVABD : SchedWriteRes<[CyUnitV]>;

// Add/diff-and-accumulate runs on the vector multiply unit.
let Latency = 3 in
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]>;
// Read advance of 1 cycle, limited to the writes listed here.
def CyReadVAccum : SchedReadAdvance<
  1, [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;

def : InstRW<
  [CyWriteVAccum, CyReadVAccum],
  (instregex "SADALP", "UADALP")>;

def : InstRW<
  [CyWriteVAccum, CyReadVAccum],
  (instregex "SABAv", "UABAv", "SABALv", "UABALv")>;
403
// Saturating add/subtract.
def : InstRW<
  [CyWriteV3],
  (instregex "SQADDv", "SQSUBv", "UQADDv", "UQSUBv")>;

def : InstRW<[CyWriteV3], (instregex "SUQADDv", "USQADDv")>;

// Narrowing high-half add/subtract.
def : InstRW<
  [CyWriteV4],
  (instregex "ADDHNv", "RADDHNv", "RSUBHNv", "SUBHNv")>;

// WriteV includes:
// AND,BIC,CMTST,EOR,ORN,ORR
// ADDP
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
// SADDL,SSUBL,UADDL,USUBL
// SADDW,SSUBW,UADDW,USUBW

// Integer compares.
def : InstRW<
  [CyWriteV3],
  (instregex "CMEQv", "CMGEv", "CMGTv",
             "CMLEv", "CMLTv",
             "CMHIv", "CMHSv")>;

// Integer min/max, element-wise and pairwise.
def : InstRW<
  [CyWriteV3],
  (instregex "SMAXv", "SMINv", "UMAXv", "UMINv",
             "SMAXPv", "SMINPv", "UMAXPv", "UMINPv")>;

// Absolute difference, including the long forms.
def : InstRW<
  [CyWriteVABD],
  (instregex "SABDv", "UABDv",
             "SABDLv", "UABDLv")>;
426
427//---
428// 7.9.3 Floating Point Arithmetic and Comparisons
429//---
430
// FABS,FNEG are WriteF

// Scalar pairwise FP add.
def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

// Scalar pairwise FP min/max.
def : InstRW<
  [CyWriteV3],
  (instregex "FMAXPv2i", "FMAXNMPv2i",
             "FMINPv2i", "FMINNMPv2i")>;

// Across-lanes FP min/max.
def : InstRW<
  [CyWriteV4],
  (instregex "FMAXVv", "FMAXNMVv", "FMINVv", "FMINNMVv")>;

// FP add/sub/abs-diff on 32-bit elements.
def : InstRW<
  [CyWriteV4],
  (instrs FADDSrr, FADDv2f32, FADDv4f32,
          FSUBSrr, FSUBv2f32, FSUBv4f32,
          FADDPv2f32, FADDPv4f32,
          FABD32, FABDv2f32, FABDv4f32)>;
// FP add/sub/abs-diff on 64-bit elements.
def : InstRW<
  [CyWriteV5],
  (instrs FADDDrr, FADDv2f64,
          FSUBDrr, FSUBv2f64,
          FADDPv2f64,
          FABD64, FABDv2f64)>;

// FP compares that produce a mask.
def : InstRW<[CyWriteV3], (instregex "FCMEQ", "FCMGT", "FCMLE", "FCMLT")>;

// FP absolute compares and min/max families.
def : InstRW<
  [CyWriteV3],
  (instregex "FACGE", "FACGT",
             "FMAXS", "FMAXD", "FMAXv",
             "FMINS", "FMIND", "FMINv",
             "FMAXNMS", "FMAXNMD", "FMAXNMv",
             "FMINNMS", "FMINNMD", "FMINNMv",
             "FMAXPv2f", "FMAXPv4f",
             "FMINPv2f", "FMINPv4f",
             "FMAXNMPv2f", "FMAXNMPv4f",
             "FMINNMPv2f", "FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE go to the dedicated FP compare pipe.
let Latency = 4 in
def : WriteRes<WriteFCmp, [CyUnitVC]>;
464
465// FCSEL is a WriteF.
466
467//---
468// 7.9.4 Shifts and Bitfield Operations
469//---
470
// SHL is a WriteV

// Plain right shifts.
let Latency = 2 in
def CyWriteVSHR : SchedWriteRes<[CyUnitV]>;
def : InstRW<[CyWriteVSHR], (instregex "SSHRv", "USHRv")>;

// Rounding right shifts.
let Latency = 3 in
def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]>;
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv", "URSHRv")>;

// Shift-and-accumulate runs on the vector multiply unit.
let Latency = 3 in
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]>;
// Read advance of 1 cycle, limited to the writes listed here.
def CyReadVShiftAcc : SchedReadAdvance<
  1, [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
def : InstRW<
  [CyWriteVShiftAcc, CyReadVShiftAcc],
  (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv", "URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv", "UQRSHLv")>;

// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
// BIF,BIT,BSL
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2

// Narrowing shifts and saturating narrows.
def : InstRW<
  [CyWriteV4],
  (instregex "RSHRNv", "SHRNv",
             "SQRSHRNv", "SQRSHRUNv", "SQSHRNv", "SQSHRUNv",
             "UQRSHRNv", "UQSHRNv", "SQXTNv", "SQXTUNv", "UQXTNv")>;
506
507//---
508// 7.9.5 Multiplication
509//---
510
// Integer vector multiplies on the vector multiply pipes.
let Latency = 4 in
def CyWriteVMul : SchedWriteRes<[CyUnitVM]>;
def : InstRW<
  [CyWriteVMul],
  (instregex "MULv", "SMULLv", "UMULLv",
             "SQDMULLv", "SQDMULHv", "SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
let Latency = 4 in
def : WriteRes<WriteFMul, [CyUnitVM]>;

// The 64-bit FP multiplies listed here take one cycle longer.
let Latency = 5 in
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]>;
def : InstRW<
  [CyWriteV64Mul],
  (instrs FMULDrr, FMULv2f64, FMULv2i64_indexed,
          FNMULDrr, FMULX64, FMULXv2f64, FMULXv2i64_indexed)>;

// Integer multiply-accumulate: 1-cycle read advance from the multiply writes.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<
  [CyWriteVMul, CyReadVMulAcc],
  (instregex "MLA", "MLS", "SMLAL", "SMLSL", "UMLAL", "UMLSL",
             "SQDMLAL", "SQDMLSL")>;

// FP multiply-accumulate, with a read advance from a preceding write of the
// same kind.
let Latency = 8 in
def CyWriteSMul : SchedWriteRes<[CyUnitVM]>;
let Latency = 10 in
def CyWriteDMul : SchedWriteRes<[CyUnitVM]>;
def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;

// NOTE(review): FMLAv1i64_indexed sits in this list next to the 32-bit
// forms; confirm that is intended rather than belonging in the list below.
def : InstRW<
  [CyWriteSMul, CyReadSMul],
  (instrs FMADDSrrr, FMSUBSrrr, FNMADDSrrr, FNMSUBSrrr,
          FMLAv2f32, FMLAv4f32,
          FMLAv1i32_indexed, FMLAv1i64_indexed, FMLAv2i32_indexed)>;
def : InstRW<
  [CyWriteDMul, CyReadDMul],
  (instrs FMADDDrrr, FMSUBDrrr, FNMADDDrrr, FNMSUBDrrr,
          FMLAv2f64, FMLAv2i64_indexed,
          FMLSv2f64, FMLSv2i64_indexed)>;

// Polynomial multiplies are mapped to the VD pipe.
let Latency = 3 in
def CyWritePMUL : SchedWriteRes<[CyUnitVD]>;
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
543
544//---
545// 7.9.6 Divide and Square Root
546//---
547
// FDIV,FSQRT
// The VD pipe is held for 2 cycles; a FloatDiv unit is busy for the whole
// 17-cycle latency.
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
let Latency = 17, ResourceCycles = [2, 17] in
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]>;

// Reciprocal and reciprocal-square-root estimates.
def : InstRW<
  [CyWriteV4],
  (instregex "FRECPEv", "FRECPXv", "URECPEv", "URSQRTEv")>;

let Latency = 4 in
def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]>;
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

// Reciprocal and reciprocal-square-root step instructions.
let Latency = 8 in
def WriteFRECPS : SchedWriteRes<[CyUnitVM]>;
let Latency = 10 in
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]>;
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
565
566//---
567// 7.9.7 Integer-FP Conversions
568//---
569
// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}

// Integer-to-FP conversions sourced from a GPR:
// SCVTF/UCVTF S/D, Rn = VLD5+V4: 9 cycles.
// The operand is first moved into the vector unit (WriteVLD), then converted
// (CyWriteV4). The regex selects the SCVTF*/UCVTF* GPR-source forms.
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FP-to-integer conversions writing a GPR:
// FCVT(A|M|N|P|Z)(S|U) Rd, S/D = V6+LD4: 10 cycles.
// The value is converted in the vector unit (CyWriteV6), then moved to the
// integer side (WriteLD). The regex selects the FCVT* GPR-destination forms.
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
585
586// FCVTL is a WriteV
587
588//---
589// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
590//---
591
// Crypto instructions are mapped to the VD pipe, grouped by latency.
let Latency = 2 in
def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]>;
def : InstRW<
  [CyWriteCrypto2],
  (instrs AESIMCrr, AESMCrr, SHA1Hrr,
          AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
          SHA1SU0rrr)>;

let Latency = 3 in
def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]>;
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

let Latency = 6 in
def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]>;
def : InstRW<
  [CyWriteCrypto6],
  (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
          SHA256Hrrr, SHA256H2rrr)>;
603
// TRN,UZP,ZIP are WriteV.
605
606// TBL,TBX are WriteV.
607
608//---
609// 7.9.11-7.9.14 Load/Store, single element and paired
610//---
611
// A load into the vector unit takes 5 cycles, versus 4 for an integer load.
let Latency = 5 in
def : WriteRes<WriteVLD, [CyUnitLS]>;

// Vector stores: the 4 cycles are the store-load forwarding latency.
let Latency = 4 in
def : WriteRes<WriteVST, [CyUnitLS]>;
621
622// WriteVLDPair/VSTPair sequences are expanded by the target description.
623
624//---
625// 7.9.15 Load, element operations
626//---
627
// Only the first WriteVLD, and the WriteAdr used for writeback, line up with
// def operands. Any further WriteVLDs just consume resources. That is
// acceptable because every loaded value has the same latency.

// Vd is read 5 cycles after the vector load issues.
def : ReadAdvance<ReadVLD, 5>;

// LD1, one register.
def : InstRW<
  [WriteVLD],
  (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr],
  (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// LD1, two registers.
// Register writes from the load's high half are fused micro-ops.
def : InstRW<
  [WriteVLD],
  (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr],
  (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVLD, WriteVLD],
  (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr, WriteVLD],
  (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

// LD1, three registers.
def : InstRW<
  [WriteVLD, WriteVLD],
  (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr, WriteVLD],
  (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVLD, WriteVLD, WriteVLD],
  (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr, WriteVLD, WriteVLD],
  (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

// LD1, four registers.
def : InstRW<
  [WriteVLD, WriteVLD],
  (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr, WriteVLD],
  (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVLD, WriteVLD, WriteVLD, WriteVLD],
  (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
  (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// LD1 to a single lane; the destination register is also read (ReadVLD).
def : InstRW<
  [WriteVLDShuffle, ReadVLD],
  (instregex "LD1i(8|16|32)$")>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteAdr],
  (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<
  [WriteVLDShuffle, ReadVLD],
  (instrs LD1i64)>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteAdr],
  (instrs LD1i64_POST)>;

// LD1R: load one element and replicate.
def : InstRW<
  [WriteVLDShuffle],
  (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr],
  (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
680
// LD2, two registers.
def : InstRW<
  [WriteVLDShuffle, WriteV],
  (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteV],
  (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<
  [WriteVLDShuffle, WriteVLDShuffle],
  (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
  (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

// LD2 to a single lane; the destination register is also read (ReadVLD).
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteV],
  (instregex "LD2i(8|16|32)$")>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
  (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteV],
  (instregex "LD2i64$")>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
  (instregex "LD2i64_POST")>;

// LD2R: load two elements and replicate.
def : InstRW<
  [WriteVLDShuffle, WriteV],
  (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteV],
  (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
703
// LD3, three registers.
def : InstRW<
  [WriteVLDShuffle, WriteVLDShuffle, WriteV],
  (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
  (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<
  [WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
  (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
  (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

// LD3 to a single lane; the destination register is also read (ReadVLD).
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteV, WriteV],
  (instregex "LD3i(8|16|32)$")>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
  (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
  (instregex "LD3i64$")>;
def : InstRW<
  [WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
  (instregex "LD3i64_POST")>;

// LD3R: load three elements and replicate.
def : InstRW<
  [WriteVLDShuffle, WriteV, WriteV],
  (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteV, WriteV],
  (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<
  [WriteVLDShuffle, WriteVLDShuffle, WriteV],
  (instrs LD3Rv1d, LD3Rv2d)>;
def : InstRW<
  [WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
  (instrs LD3Rv1d_POST, LD3Rv2d_POST)>;
732
// LD4, four registers.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

// LD4 to a single lane; the destination register is also read (ReadVLD).
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

// The _POST form is the non-POST list with WriteAdr inserted for the
// writeback def, matching every other load in this section.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

// LD4R: load four elements and replicate.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
764
765//---
766// 7.9.16 Store, element operations
767//---
768
// Stores have no data def operands: only the WriteAdr for writeback lines up
// with a def operand. The WriteVST/WriteVSTShuffle entries only consume
// resources.

// ST1, one register.
def : InstRW<
  [WriteVST],
  (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVST],
  (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// ST1, two registers.
def : InstRW<
  [WriteVSTShuffle],
  (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVST, WriteVST],
  (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVST, WriteVST],
  (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

// ST1, three registers.
def : InstRW<
  [WriteVSTShuffle, WriteVST],
  (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVST],
  (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVST, WriteVST, WriteVST],
  (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVST, WriteVST, WriteVST],
  (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

// ST1, four registers.
def : InstRW<
  [WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVST, WriteVST, WriteVST, WriteVST],
  (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
  (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1 from a single lane.
def : InstRW<
  [WriteVSTShuffle],
  (instregex "ST1i(8|16|32)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<
  [WriteVSTShuffle],
  (instrs ST1i64)>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instrs ST1i64_POST)>;
809
// ST2, two registers.
def : InstRW<
  [WriteVSTShuffle],
  (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<
  [WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

// ST2 from a single lane.
def : InstRW<
  [WriteVSTShuffle],
  (instregex "ST2i(8|16|32)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<
  [WriteVSTShuffle],
  (instrs ST2i64)>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instrs ST2i64_POST)>;
823
// ST3, three registers.
def : InstRW<
  [WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<
  [WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

// ST3 from a single lane.
def : InstRW<
  [WriteVSTShuffle],
  (instregex "ST3i(8|16|32)$")>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle],
  (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<
  [WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST3i64)>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST3i64_POST)>;
838
// ST4, four registers.
def : InstRW<
  [WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<
  [WriteVSTPairShuffle, WriteVSTPairShuffle,
   WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<
  [WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
   WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

// ST4 from a single lane.
def : InstRW<
  [WriteVSTPairShuffle],
  (instregex "ST4i(8|16|32)$")>;
def : InstRW<
  [WriteAdr, WriteVSTPairShuffle],
  (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<
  [WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST4i64)>;
def : InstRW<
  [WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST4i64_POST)>;
855
// This model does not support atomic operations; mark the write type as such.
let Unsupported = 1 in
def : WriteRes<WriteAtomic, []>;
858
859//---
860// Unused SchedRead types
861//---
862
// These generic read types get no forwarding on Cyclone: each one is given a
// read advance of zero cycles.
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;
869
870} // SchedModel = CycloneModel
871