xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td (revision 716fd348e01c5f2ba125f878a634a753436c2994)
1//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for ARM Cortex-A57 to support
10// instruction scheduling and other instruction cost heuristics.
11//
12//===----------------------------------------------------------------------===//
13
14//===----------------------------------------------------------------------===//
15// *** Common description and scheduling model parameters taken from AArch64 ***
16// The Cortex-A57 is a traditional superscalar microprocessor with a
17// conservative 3-wide in-order stage for decode and dispatch. Combined with the
18// much wider out-of-order issue stage, this produced a need to carefully
19// schedule micro-ops so that all three decoded each cycle are successfully
20// issued as the reservation station(s) simply don't stay occupied for long.
21// Therefore, IssueWidth is set to the narrower of the two at three, while still
22// modeling the machine as out-of-order.
23
// Conjunction of the IsCPSRDefined and IsPredicated checks, together with
// the scheduling-predicate wrapper used by the variants below.
def IsCPSRDefinedAndPredicated
    : CheckAll<[IsCPSRDefined, IsPredicated]>;
def IsCPSRDefinedAndPredicatedPred
    : MCSchedPredicate<IsCPSRDefinedAndPredicated>;

// Selector for Cortex-A57 revision r1p0 or later. It is hard-wired to
// FalsePred, so the r0px timings are the ones that get picked.
def IsR1P0AndLaterPred
    : MCSchedPredicate<FalsePred>;

// Addrmode3 predicates; the plain, X2 and X3 flavours apply
// CheckInvalidRegOperand to operand 2, 3 and 4 respectively.
// NOTE(review): despite the "RegOff" names these test for an *invalid*
// register operand -- confirm the intended polarity against the users.
def IsLdrAm3RegOffPred
    : MCSchedPredicate<CheckInvalidRegOperand<2>>;
def IsLdrAm3RegOffPredX2
    : MCSchedPredicate<CheckInvalidRegOperand<3>>;
def IsLdrAm3RegOffPredX3
    : MCSchedPredicate<CheckInvalidRegOperand<4>>;
34
// Addrmode3 with a "minus register" offset: operand n has to be a valid
// register and operand n+1 has to satisfy CheckAM3OpSub.
class Am3NegativeRegOffset<int n>
    : MCSchedPredicate<
          CheckAll<[CheckValidRegOperand<n>,
                    CheckAM3OpSub<!add(n, 1)>]>>;

// Instances for register operand index 2, 3 and 4.
def IsLdrAm3NegRegOffPred   : Am3NegativeRegOffset<2>;
def IsLdrAm3NegRegOffPredX2 : Am3NegativeRegOffset<3>;
def IsLdrAm3NegRegOffPredX3 : Am3NegativeRegOffset<4>;
43
// Scaled register offset that is NOT the "plus, LSL #2" form.
// The check is the negation of: (no shift at all) OR (add AND LSL AND
// offset amount 2), all evaluated on operand n.
class ScaledRegNotPlusLsl2<int n>
    : CheckNot<CheckAny<[
        CheckAM2NoShift<n>,
        CheckAll<[CheckAM2OpAdd<n>,
                  CheckAM2ShiftLSL<n>,
                  CheckAM2Offset<n, 2>]>
      ]>>;

// X0 / (unsuffixed) / X2 apply the check to operand 2 / 3 / 4.
def IsLdstsoScaledNotOptimalPredX0
    : MCSchedPredicate<ScaledRegNotPlusLsl2<2>>;
def IsLdstsoScaledNotOptimalPred
    : MCSchedPredicate<ScaledRegNotPlusLsl2<3>>;
def IsLdstsoScaledNotOptimalPredX2
    : MCSchedPredicate<ScaledRegNotPlusLsl2<4>>;
59
// Operand 4 carries some shift (i.e. CheckAM2NoShift does not hold).
def IsLdstsoScaledPredX2
    : MCSchedPredicate<CheckNot<CheckAM2NoShift<4>>>;

// CheckAM2OpSub applied to operand 2 (X0), 3 (unsuffixed) and 4 (X2).
def IsLdstsoMinusRegPredX0
    : MCSchedPredicate<CheckAM2OpSub<2>>;
def IsLdstsoMinusRegPred
    : MCSchedPredicate<CheckAM2OpSub<3>>;
def IsLdstsoMinusRegPredX2
    : MCSchedPredicate<CheckAM2OpSub<4>>;
65
// Holder for an ordered list of SchedWriteRes records; the load-multiple
// variants slice prefixes out of `Writes`.
class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
  // Left unset here.
  SchedMachineModel SchedModel = ?;
  // The write resources, in the order supplied by the instantiation.
  list<SchedWriteRes> Writes = writes;
}
70
// *** Common description and scheduling model parameters taken from the
// AArch64 model (AArch64SchedA57.td) ***
def CortexA57Model : SchedMachineModel {
  // 3-way decode and dispatch.
  let IssueWidth = 3;
  // 128 micro-op re-order buffer.
  let MicroOpBufferSize = 128;
  // Optimistic load latency.
  let LoadLatency = 4;
  // Fetch + Decode/Rename/Dispatch + Branch.
  let MispredictPenalty = 16;

  // Enable partial & runtime unrolling.
  let LoopMicroOpBufferSize = 16;

  let CompleteModel = 1;

  // FIXME: Remove when all errors have been fixed.
  let FullInstRWOverlapCheck = 0;

  let UnsupportedFeatures = [
    HasV8_1MMainline, HasMVEInt, HasMVEFloat, HasFPRegsV8_1M,
    HasFP16FML, HasMatMulInt8, HasBF16
  ];
}
89
//===----------------------------------------------------------------------===//
// Processor resources available on Cortex-A57, and how many of each.
// The Cortex-A57 has 8 pipelines, each with its own 8-entry queue in which
// micro-ops wait for their operands before issuing out-of-order.

// Integer / memory side.
def A57UnitB : ProcResource<1>; // Type B micro-ops
def A57UnitI : ProcResource<2>; // Type I micro-ops
def A57UnitM : ProcResource<1>; // Type M micro-ops
def A57UnitL : ProcResource<1>; // Type L micro-ops
def A57UnitS : ProcResource<1>; // Type S micro-ops

// FP / vector side.
def A57UnitX : ProcResource<1>; // Type X micro-ops (F1)
def A57UnitW : ProcResource<1>; // Type W micro-ops (F0)

// Type V micro-ops: a group made of the X and W units.
let SchedModel = CortexA57Model in
def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;
107
108let SchedModel = CortexA57Model in {
109
110//===----------------------------------------------------------------------===//
111// Define customized scheduler read/write types specific to the Cortex-A57.
112
113include "ARMScheduleA57WriteRes.td"
114
// "CompleteModel = 1" requires every opcode to be covered, so pseudos and
// special instructions are mapped to WriteNoop here.
def : InstRW<[WriteNoop],
  (instregex
    "(t)?BKPT$", "(t2)?CDP(2)?$", "(t2)?CLREX$", "CONSTPOOL_ENTRY$",
    "COPY_STRUCT_BYVAL_I32$", "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$",
    "(t2)?DSB$", "ERET$", "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$",
    "(t2)?ISB$", "ITasm$", "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND",
    "(t2)?SETPAN", "(t2)?SMC", "SPACE", "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?",
    "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG", "t2TT", "tCPS", "CMP_SWAP",
    "t?SVC", "t2IT", "CompilerBarrier", "t__brkdiv0")>;
124
// VMRS / VMSR / FMSTAT are modeled as no-ops.
def : InstRW<[WriteNoop],
  (instregex "VMRS", "VMSR", "FMSTAT")>;

// Specific memory instructions (two WriteNoop entries each).
def : InstRW<[WriteNoop, WriteNoop],
  (instregex
    "(t2)?LDA", "(t2)?LDC", "(t2)?STC", "(t2)?STL",
    "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;

// Coprocessor moves.
def : InstRW<[WriteNoop, WriteNoop],
  (instregex
    "(t2)?MCR(2|R|R2)?$",
    "(t2)?MRC(2)?$",
    "(t2)?MRRC(2)?$",
    "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
    "(t2)?MSR(banked|i|_AR|_M)?$")>;

// Deprecated instructions.
def : InstRW<[WriteNoop],
  (instregex "FLDM", "FSTM")>;
139
// Pseudo instructions, all mapped to WriteNoop.
def : InstRW<[WriteNoop],
  (instregex
    "(t2)?ABS$",
    "(t)?ADJCALLSTACKDOWN$",
    "(t)?ADJCALLSTACKUP$",
    "(t2|t)?Int_eh_sjlj",
    "tLDRpci_pic",
    "(t2)?SUBS_PC_LR",
    "JUMPTABLE",
    "tInt_WIN_eh_sjlj_longjmp",
    "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
    "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
    "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
    "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
    "WIN__CHKSTK",
    "WIN__DBZCHK")>;
150
// Miscellaneous
// -----------------------------------------------------------------------------

// COPY: 1 cycle on an I unit.
def : InstRW<[A57Write_1cyc_1I],
  (instrs COPY)>;
155
// --- 3.2 Branch Instructions ---
// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ

// 1 cycle, B unit only.
def : InstRW<[A57Write_1cyc_1B],
  (instregex
    "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$", "t?TAILJMP(d|r)",
    "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;

// 1 cycle, B + I units.
def : InstRW<[A57Write_1cyc_1B_1I],
  (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;

// 2 cycles, B + I units.
def : InstRW<[A57Write_2cyc_1B_1I],
  (instregex "BLX", "tBLX(NS)?r")>;

// Pseudos
def : InstRW<[A57Write_2cyc_1B_1I],
  (instregex "BCCi64", "BCCZi64")>;
def : InstRW<[A57Write_3cyc_1B_1I],
  (instregex
    "BR_JTadd", "t?BR_JTr", "t2BR_JT", "t2BXJ",
    "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
def : InstRW<[A57Write_6cyc_1B_1L],
  (instregex "BR_JTm")>;
169
// --- 3.3 Arithmetic and Logical Instructions ---
// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST

// tADDframe: 1 cycle on an I unit.
def : InstRW<[A57Write_1cyc_1I],
  (instregex "tADDframe")>;
175
// Branch forms of ALU ops (operand 0 checked for ARM_AM::PC by the users of
// this class): relative to the non-branch write `non_br`, add 2 cycles of
// latency, 1 micro-op and 1 resource cycle on A57UnitB.
class A57BranchForm<SchedWriteRes non_br>
    : BranchWriteRes<2, 1, [A57UnitB], [1], non_br>;
181
// ALU op with shift by register, conditional or unconditional.
// TODO: according to the documentation the conditional form uses I0/I1 while
// the unconditional form uses M. It is odd that the more complex instruction
// would use the simpler pipeline; this may be an error in the documentation.
def A57WriteALUsr : SchedWriteVariant<[
  // Predicated: 2 cycles on I.
  SchedVar<IsPredicatedPred,
           [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>,
  // Otherwise: 2 cycles on M.
  SchedVar<NoSchedPred,
           [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>
]>;

// Flag-setting counterpart; same selection as A57WriteALUsr.
def A57WriteALUSsr : SchedWriteVariant<[
  SchedVar<IsPredicatedPred,
           [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>,
  SchedVar<NoSchedPred,
           [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>
]>;

// Read side: both alternatives resolve to ReadDefault.
def A57ReadALUsr : SchedReadVariant<[
  SchedVar<IsPredicatedPred, [ReadDefault]>,
  SchedVar<NoSchedPred,      [ReadDefault]>
]>;

// Bind the generic ALU shift writes/reads to the A57 definitions.
def : SchedAlias<WriteALUsi,
                 CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>>;
def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
202
// Compare with shift by register: 2 cycles on I when predicated, otherwise
// 2 cycles on M.
def A57WriteCMPsr : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
  SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
]>;

// Bind the generic compare writes.
def : SchedAlias<WriteCMP,   A57Write_1cyc_1I>;
def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
210
// --- 3.4 Move and Shift Instructions ---
// Basic moves: MOV{S}, MOVW, MVN{S} -- 1 cycle on an I unit.
def : InstRW<[A57Write_1cyc_1I],
  (instregex
    "MOV(r|i|i16|r_TC)",
    "(t2)?MVN(CC)?(r|i)",
    "BMOVPCB_CALL",
    "BMOVPCRX_CALL",
    "MOVCC(r|i|i16|i32imm)",
    "tMOV",
    "tMVN")>;
217
// Move with shift by immediate, with or without setflags.
// (ASR, LSL, LSR, ROR, RRX) = MOVsi, MVN
// "setflags" is expressed through IsCPSRDefinedPred:
//   setflags    -> 2 cycles on M
//   no setflags -> 1 cycle on I
def A57WriteMOVsi : SchedWriteVariant<[
  SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
  SchedVar<NoSchedPred,       [A57Write_1cyc_1I]>
]>;
def : InstRW<[A57WriteMOVsi],
  (instregex
    "MOV(CC)?si", "MVNsi",
    "ASRi", "(t2|t)ASRri",
    "LSRi", "(t2|t)LSRri",
    "LSLi", "(t2|t)LSLri",
    "RORi", "(t2|t)RORri",
    "(t2)?RRX", "t2MOV", "tROR")>;
228
// Move with shift by register; conditional or unconditional, with or without
// setflags. Alternatives are listed most-specific first.
def A57WriteMOVsr : SchedWriteVariant<[
  // setflags and predicated
  SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
  // setflags only
  SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
  // predicated only
  SchedVar<IsPredicatedPred,               [A57Write_2cyc_1I]>,
  // neither
  SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
]>;
def : InstRW<[A57WriteMOVsr],
  (instregex
    "MOV(CC)?sr", "MVNsr", "t2MVNs",
    "ASRr", "(t2|t)ASRrr",
    "LSRr", "(t2|t)LSRrr",
    "LSLr", "(t2|t)?LSLrr",
    "RORr", "(t2|t)RORrr")>;
239
// Move, top (MOVT): A57Write_1cyc_1I on r1p0 and later, A57Write_2cyc_1M on
// r0px.
def A57WriteMOVT : SchedWriteVariant<[
  SchedVar<IsR1P0AndLaterPred, [A57Write_1cyc_1I]>,
  SchedVar<NoSchedPred,        [A57Write_2cyc_1M]>
]>;
def : InstRW<[A57WriteMOVT],
  (instregex "MOVTi16")>;
247
// MOV_ga_pcrel: a sequence of three 1-cycle I writes.
def A57WriteI2pc : WriteSequence<[
  A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I
]>;
// MOV_ga_pcrel_ldr: two 1-cycle I writes followed by a 4-cycle L write.
def A57WriteI2ld : WriteSequence<[
  A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L
]>;
def : InstRW<[A57WriteI2pc],
  (instregex "MOV_ga_pcrel")>;
def : InstRW<[A57WriteI2ld],
  (instregex "MOV_ga_pcrel_ldr")>;

// Branch forms cost +2 cycles.
def : InstRW<[A57Write_3cyc_1I],
  (instregex "MOVPC(LR|RX)")>;
257
// --- 3.5 Divide and Multiply Instructions ---
// Divide: SDIV, UDIV
// The documentation gives a latency of 4 - 20 cycles; the maximum is used.
def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;

// Multiply: tMUL is not bound to the common WriteRes types, so it is listed
// explicitly.
def : InstRW<[A57Write_3cyc_1M],
  (instregex "tMUL")>;
def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
def : ReadAdvance<ReadMUL, 0>;
267
// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
// The multiply-accumulate pipelines support late-forwarding of accumulate
// operands from similar micro-ops, so a typical sequence of
// multiply-accumulate micro-ops can issue one per cycle (sched advance = 2).
def A57WriteMLA : SchedWriteRes<[A57UnitM]> {
  let Latency = 3;
}

// Long multiply-accumulate: 5 cycles on I+M when CPSR is defined, otherwise
// 4 cycles on M.
def A57WriteMLAL : SchedWriteVariant<[
  SchedVar<IsCPSRDefinedPred, [A57Write_5cyc_1I_1M]>,
  SchedVar<NoSchedPred,       [A57Write_4cyc_1M]>
]>;

// Read advance of 2 cycles from either of the two writes above.
def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;

def : InstRW<[A57WriteMLA],
  (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;

// Bind the generic multiply-accumulate writes/reads.
def : SchedAlias<WriteMAC16, A57WriteMLA>;
def : SchedAlias<WriteMAC32, A57WriteMLA>;
def : SchedAlias<ReadMAC,    A57ReadMLA>;

def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
290
// Multiply long (SMULL, UMULL): both halves are 4 cycles on M.
def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
294
// --- 3.6 Saturating and Parallel Arithmetic Instructions ---
// Parallel arith:
// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
// Conditional GE-setting instructions require three extra micro-ops and two
// additional cycles to conditionally update the GE field.
def A57WriteParArith : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
  SchedVar<NoSchedPred,      [A57Write_2cyc_1I_1M]>
]>;
def : InstRW<[A57WriteParArith],
  (instregex
    "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
    "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;

// Parallel arith with exchange: SASX, SSAX, UASX, USAX
def A57WriteParArithExch : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
  SchedVar<NoSchedPred,      [A57Write_3cyc_1I_1M]>
]>;
def : InstRW<[A57WriteParArithExch],
  (instregex
    "(t2)?SASX", "(t2)?SSAX",
    "(t2)?UASX", "(t2)?USAX")>;
315
// Parallel halving arith:
// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
def : InstRW<[A57Write_2cyc_1M],
  (instregex
    "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
    "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;

// Parallel halving arith with exchange:
// SHASX, SHSAX, UHASX, UHSAX
def : InstRW<[A57Write_3cyc_1I_1M],
  (instregex
    "(t2)?SHASX", "(t2)?SHSAX",
    "(t2)?UHASX", "(t2)?UHSAX")>;

// Parallel saturating arith:
// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
def : InstRW<[A57Write_2cyc_1M],
  (instregex
    "QADD(16|8)", "QSUB(16|8)",
    "UQADD(16|8)", "UQSUB(16|8)",
    "t2(U?)QADD", "t2(U?)QSUB")>;

// Parallel saturating arith with exchange:
// QASX, QSAX, UQASX, UQSAX
def : InstRW<[A57Write_3cyc_1I_1M],
  (instregex
    "(t2)?QASX", "(t2)?QSAX",
    "(t2)?UQASX", "(t2)?UQSAX")>;

// Saturate: SSAT, SSAT16, USAT, USAT16
def : InstRW<[A57Write_2cyc_1M],
  (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;

// Saturating arith: QADD, QSUB
def : InstRW<[A57Write_2cyc_1M],
  (instregex "QADD$", "QSUB$")>;

// Saturating doubling arith: QDADD, QDSUB
def : InstRW<[A57Write_3cyc_1I_1M],
  (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
346
// --- 3.7 Miscellaneous Data-Processing Instructions ---
// Bit field extract: SBFX, UBFX
def : InstRW<[A57Write_1cyc_1I],
  (instregex "(t2)?SBFX", "(t2)?UBFX")>;

// Bit field insert/clear: BFI, BFC
def : InstRW<[A57Write_2cyc_1M],
  (instregex "(t2)?BFI", "(t2)?BFC")>;

// Select bytes: 2 cycles on I when predicated, 1 cycle on I otherwise.
def A57WriteSEL : SchedWriteVariant<[
  SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
  SchedVar<NoSchedPred,      [A57Write_1cyc_1I]>
]>;
def : InstRW<[A57WriteSEL],
  (instregex "(t2)?SEL")>;

// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
def : InstRW<[A57Write_1cyc_1I],
  (instregex
    "(t2|t)?SXT(B|H)$",
    "(t2|t)?UXT(B|H)$")>;

// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
def : InstRW<[A57Write_2cyc_1M],
  (instregex
    "(t2)?SXTA(B|H)$",
    "(t2)?UXTA(B|H)$")>;

// Sign/zero extend and add, parallel: SXTAB16, UXTAB16
def : InstRW<[A57Write_4cyc_1M],
  (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;

// Sum of absolute differences: USAD8, USADA8
def : InstRW<[A57Write_3cyc_1M],
  (instregex "(t2)?USAD8", "(t2)?USADA8")>;
374
// --- 3.8 Load Instructions ---

// Load, immediate offset: 4 cycles on L.
// LDR and LDRB use the LDRi12 and LDRBi12 forms for an immediate.
def : InstRW<[A57Write_4cyc_1L],
  (instregex
    "LDRi12", "LDRBi12", "LDRcp",
    "(t2|t)?LDRConstPool",
    "LDRLIT_ga_(pcrel|abs)",
    "PICLDR", "tLDR")>;

// Thumb2 loads, same cost.
def : InstRW<[A57Write_4cyc_1L],
  (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
385
// "Load, register offset, minus" costs +1 cycle and +1 I micro-op compared
// with the plain 4-cycle L load.
def A57WriteLdrAm3 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
  SchedVar<NoSchedPred,           [A57Write_4cyc_1L]>
]>;
def : InstRW<[A57WriteLdrAm3],
  (instregex "LDR(H|SH|SB)$")>;

// Same selection for LDRD, using the X2 predicate.
def A57WriteLdrAm3X2 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
  SchedVar<NoSchedPred,             [A57Write_4cyc_1L]>
]>;
def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2],
  (instregex "LDRD$")>;
def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L],
  (instregex "t2LDRDi8")>;
398
// LDRrs / LDRBrs: 5 cycles on I+L for a scaled offset that is not
// "plus LSL #2" or for a minus-register offset; 4 cycles on L otherwise.
def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
  SchedVar<IsLdstsoMinusRegPred,         [A57Write_5cyc_1I_1L]>,
  SchedVar<NoSchedPred,                  [A57Write_4cyc_1L]>
]>;
def : InstRW<[A57WriteLdrAmLDSTSO],
  (instregex "LDRrs", "LDRBrs")>;
405
// Write-back results: no processor resources and zero micro-ops, with a
// latency of 1, 2 or 3 cycles.
let NumMicroOps = 0 in {
  def A57WrBackOne   : SchedWriteRes<[]> { let Latency = 1; }
  def A57WrBackTwo   : SchedWriteRes<[]> { let Latency = 2; }
  def A57WrBackThree : SchedWriteRes<[]> { let Latency = 3; }
}
418
// --- LDR pre-indexed ---
// Load, immediate pre-indexed: 4 cycles for the load result, 1 cycle for the
// base update.
def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne],
  (instregex "LDR_PRE_IMM", "LDRB_PRE_IMM", "t2LDRB_PRE")>;

// Load, register pre-indexed: 4 cycles for the load result, 2 cycles for the
// base update; the load result takes 5 cycles for a scaled offset that is not
// "plus LSL #2".
def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
  SchedVar<NoSchedPred,                    [A57Write_4cyc_1L_1I]>
]>;
def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
  (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
432
// Write-back latency for pre-indexed LDRH/LDRSH/LDRSB: 2 cycles when
// IsLdrAm3RegOffPredX2 holds, 1 cycle otherwise.
def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
  SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,          [A57WrBackOne]>
]>;
def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
  (instregex "LDR(H|SH|SB)_PRE")>;
def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
  (instregex "t2LDR(H|SH|SB)?_PRE")>;

// LDRD pre-indexed: 5(2) cycles for reg, 4(1) cycles for imm
// (load result, with the write-back latency in parentheses).
def A57WriteLdrDAm3Pre : SchedWriteVariant<[
  SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
  SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>
]>;
def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
  SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,          [A57WrBackOne]>
]>;
def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre,
              A57WriteLdrDAm3PreWrBack],
  (instregex "LDRD_PRE")>;
def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
  (instregex "t2LDRD_PRE")>;
455
// --- LDR post-indexed ---
// Immediate post-indexed: 4-cycle L+I load, 1-cycle write-back.
def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne],
  (instregex
    "LDR(T?)_POST_IMM", "LDRB(T?)_POST_IMM",
    "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;

// Write-back latency for post-indexed LDRH/LDRSH/LDRSB: 2 cycles when
// IsLdrAm3RegOffPred holds, 1 cycle otherwise.
def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
  SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,        [A57WrBackOne]>
]>;
def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
  (instregex "LDR(H|SH|SB)_POST")>;
def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
  (instregex "t2LDR(H|SH|SB)?_POST")>;

// Register post-indexed: 4-cycle L+I load, 2-cycle write-back.
def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo],
  (instregex "LDR_POST_REG", "LDRB_POST_REG", "LDR(B?)T_POST$")>;
471
// LDRT/LDRBT register post-indexed:
//   4(3) "I0/I1,L,M" for a scaled register, otherwise 4(2) "I0/I1,L"
// (load result, with the write-back latency in parentheses).
def A57WriteLdrTRegPost : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledPredX2, [A57Write_4cyc_1I_1L_1M]>,
  SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>
]>;
def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledPredX2, [A57WrBackThree]>,
  SchedVar<NoSchedPred,          [A57WrBackTwo]>
]>;
def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
  (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;

// LDRSBT / LDRHT / LDRSHT, register form: 4-cycle L+I, 2-cycle write-back.
def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo],
  (instregex "LDR(SB|H|SH)Tr")>;
485
// LDRD post-indexed: 4(2) cycles for reg, 4(1) cycles for imm
// (load result, with the write-back latency in parentheses).
def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
  SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,          [A57WrBackOne]>
]>;
def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
              A57WriteLdrAm3PostWrBackX3],
  (instregex "LDRD_POST")>;
def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
  (instregex "t2LDRD_POST")>;
495
// --- Preload instructions ---
// Preload, immediate offset: 4 cycles on L.
def : InstRW<[A57Write_4cyc_1L],
  (instregex
    "(t2)?PLDi12", "(t2)?PLDWi12",
    "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;

// Preload, register offset:
//   5 cycles "I0/I1,L" for a minus register or a scaled offset that is not
//   "plus LSL #2"; otherwise 4 cycles "L".
def A57WritePLD : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
  SchedVar<IsLdstsoMinusRegPredX0,         [A57Write_5cyc_1I_1L]>,
  SchedVar<NoSchedPred,                    [A57Write_4cyc_1L]>
]>;
def : InstRW<[A57WritePLD],
  (instregex "PLDrs", "PLDWrs")>;
510
// --- Load multiple instructions ---
// Operand-count predicates, generated for NumAddr = 1..8:
//   A57LMAddrPred<N>    matches 2*N + 2 or 2*N + 3 operands
//   A57LMAddrUpdPred<N> matches 2*N + 3 or 2*N + 4 operands
foreach NumAddr = 1-8 in {
  def A57LMAddrPred#NumAddr
      : MCSchedPredicate<CheckAny<[
          CheckNumOperands<!add(!shl(NumAddr, 1), 2)>,
          CheckNumOperands<!add(!shl(NumAddr, 1), 3)>]>>;
  def A57LMAddrUpdPred#NumAddr
      : MCSchedPredicate<CheckAny<[
          CheckNumOperands<!add(!shl(NumAddr, 1), 3)>,
          CheckNumOperands<!add(!shl(NumAddr, 1), 4)>]>>;
}
520
// Per-register writes for the "Noregin" LDM case: L-only writes, two entries
// per latency step, from 3 up to 10 cycles.
def A57LDMOpsListNoregin : A57WriteLMOpsListType<[
  A57Write_3cyc_1L,  A57Write_3cyc_1L,
  A57Write_4cyc_1L,  A57Write_4cyc_1L,
  A57Write_5cyc_1L,  A57Write_5cyc_1L,
  A57Write_6cyc_1L,  A57Write_6cyc_1L,
  A57Write_7cyc_1L,  A57Write_7cyc_1L,
  A57Write_8cyc_1L,  A57Write_8cyc_1L,
  A57Write_9cyc_1L,  A57Write_9cyc_1L,
  A57Write_10cyc_1L, A57Write_10cyc_1L
]>;

// A57LMAddrPred<N> selects the first 2*N entries; the fallback uses the
// whole list.
def A57WriteLDMnoreginlist : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>,
  SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>,
  SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>,
  SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>,
  SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>,
  SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>,
  SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>,
  SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>,
  SchedVar<NoSchedPred,    A57LDMOpsListNoregin.Writes[0-15]>
]> {
  let Variadic = 1;
}
541
// Per-register writes for the "Regin" LDM case: L+I writes, two entries per
// latency step, from 4 up to 11 cycles.
def A57LDMOpsListRegin : A57WriteLMOpsListType<[
  A57Write_4cyc_1L_1I,  A57Write_4cyc_1L_1I,
  A57Write_5cyc_1L_1I,  A57Write_5cyc_1L_1I,
  A57Write_6cyc_1L_1I,  A57Write_6cyc_1L_1I,
  A57Write_7cyc_1L_1I,  A57Write_7cyc_1L_1I,
  A57Write_8cyc_1L_1I,  A57Write_8cyc_1L_1I,
  A57Write_9cyc_1L_1I,  A57Write_9cyc_1L_1I,
  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
  A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I
]>;

// A57LMAddrPred<N> selects the first 2*N entries; the fallback uses the
// whole list.
def A57WriteLDMreginlist : SchedWriteVariant<[
  SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>,
  SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>,
  SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>,
  SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>,
  SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>,
  SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>,
  SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>,
  SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>,
  SchedVar<NoSchedPred,    A57LDMOpsListRegin.Writes[0-15]>
]> {
  let Variadic = 1;
}
562
// Writes for LDM with base update: entry 0 is the write-back
// (A57WrBackOne), followed by L+I writes, two entries per latency step, from
// 3 up to 10 cycles.
def A57LDMOpsList_Upd : A57WriteLMOpsListType<[
  A57WrBackOne,
  A57Write_3cyc_1L_1I,  A57Write_3cyc_1L_1I,
  A57Write_4cyc_1L_1I,  A57Write_4cyc_1L_1I,
  A57Write_5cyc_1L_1I,  A57Write_5cyc_1L_1I,
  A57Write_6cyc_1L_1I,  A57Write_6cyc_1L_1I,
  A57Write_7cyc_1L_1I,  A57Write_7cyc_1L_1I,
  A57Write_8cyc_1L_1I,  A57Write_8cyc_1L_1I,
  A57Write_9cyc_1L_1I,  A57Write_9cyc_1L_1I,
  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I
]>;

// A57LMAddrUpdPred<N> selects the first 2*N + 1 entries; the fallback uses
// the whole list.
def A57WriteLDM_Upd : SchedWriteVariant<[
  SchedVar<A57LMAddrUpdPred1, A57LDMOpsList_Upd.Writes[0-2]>,
  SchedVar<A57LMAddrUpdPred2, A57LDMOpsList_Upd.Writes[0-4]>,
  SchedVar<A57LMAddrUpdPred3, A57LDMOpsList_Upd.Writes[0-6]>,
  SchedVar<A57LMAddrUpdPred4, A57LDMOpsList_Upd.Writes[0-8]>,
  SchedVar<A57LMAddrUpdPred5, A57LDMOpsList_Upd.Writes[0-10]>,
  SchedVar<A57LMAddrUpdPred6, A57LDMOpsList_Upd.Writes[0-12]>,
  SchedVar<A57LMAddrUpdPred7, A57LDMOpsList_Upd.Writes[0-14]>,
  SchedVar<A57LMAddrUpdPred8, A57LDMOpsList_Upd.Writes[0-16]>,
  SchedVar<NoSchedPred,       A57LDMOpsList_Upd.Writes[0-16]>
]> {
  let Variadic = 1;
}
584
// LDM without base update: pick the "regin" list when
// IsLDMBaseRegInListPred holds, the "noregin" list otherwise.
def A57WriteLDM : SchedWriteVariant<[
  SchedVar<IsLDMBaseRegInListPred, [A57WriteLDMreginlist]>,
  SchedVar<NoSchedPred,            [A57WriteLDMnoreginlist]>
]> {
  let Variadic = 1;
}

def : InstRW<[A57WriteLDM],
  (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;

// TODO: the documentation defines no write-back latency; it is implemented
// as 1 cycle.
def : InstRW<[A57WriteLDM_Upd],
  (instregex
    "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)",
    "tPOP")>;

def : InstRW<[A57Write_5cyc_1L],
  (instregex "VLLDM")>;
597
// --- 3.9 Store Instructions ---

// Store, immediate offset: 1 cycle on S.
def : InstRW<[A57Write_1cyc_1S],
  (instregex
    "STRi12", "STRBi12", "PICSTR",
    "t2STR(B?)(T|i12|i8|s)", "t2STRDi8",
    "t2STRH(i12|i8|s)", "tSTR")>;

// Store, register offset:
//   3 cycles "I0/I1, S" for a minus register or a scaled offset that is not
//   "plus LSL #2"; otherwise 1 cycle "S".
def A57WriteStrAmLDSTSO : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
  SchedVar<IsLdstsoMinusRegPred,         [A57Write_3cyc_1I_1S]>,
  SchedVar<NoSchedPred,                  [A57Write_1cyc_1S]>
]>;
def : InstRW<[A57WriteStrAmLDSTSO],
  (instregex "STRrs", "STRBrs")>;
613
// STRH, STRD: 3 cycles "I0/I1, S" for a minus register; 1 cycle "S" for an
// immediate or a plus register.
def A57WriteStrAm3 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
  SchedVar<NoSchedPred,           [A57Write_1cyc_1S]>
]>;
def : InstRW<[A57WriteStrAm3],
  (instregex "STRH$")>;

// STRD uses the X2 predicate.
def A57WriteStrAm3X2 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
  SchedVar<NoSchedPred,             [A57Write_1cyc_1S]>
]>;
def : InstRW<[A57WriteStrAm3X2],
  (instregex "STRD$")>;
625
// Store, immediate pre-indexed: 1 cycle "S, I0/I1" plus a 1-cycle
// write-back (listed first).
def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
  (instregex
    "STR_PRE_IMM", "STRB_PRE_IMM",
    "STR(B)?(r|i)_preidx",
    "(t2)?STRH_(preidx|PRE)",
    "t2STR(B?)_(PRE|preidx)",
    "t2STRD_PRE")>;
630
// Store, register pre-indexed (store latency, write-back in parentheses):
//   1(1) "S, I0/I1" for plus reg
//   3(2) "I0/I1, S" for minus reg
//   1(2) "S, M"     for scaled plus lsl2
//   3(2) "I0/I1, S" for other scaled
def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
  SchedVar<IsLdstsoMinusRegPredX2,         [A57Write_3cyc_1I_1S]>,
  SchedVar<IsLdstsoScaledPredX2,           [A57Write_1cyc_1S_1M]>,
  SchedVar<NoSchedPred,                    [A57Write_1cyc_1S_1I]>
]>;

// Matching write-back latency.
def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
  SchedVar<IsLdstsoScaledPredX2,   [A57WrBackTwo]>,
  SchedVar<IsLdstsoMinusRegPredX2, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,            [A57WrBackOne]>
]>;
def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
  (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
649
// Pre-indexed STRH/STRD (STRH_PRE, STRD_PRE), store latency with the
// write-back in parentheses:
//   1(1) "S, I0/I1" for imm or reg plus
//   3(2) "I0/I1, S" for reg minus

// STRH_PRE uses the X2 predicate.
def A57WriteStrAm3PreX2 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
  SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
]>;
def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,             [A57WrBackOne]>
]>;
def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
  (instregex "STRH_PRE")>;

// STRD_PRE uses the X3 predicate.
def A57WriteStrAm3PreX3 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
  SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
]>;
def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
  SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
  SchedVar<NoSchedPred,             [A57WrBackOne]>
]>;
def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
  (instregex "STRD_PRE")>;
674
// Store, immediate post-indexed: 1 cycle "S, I0/I1", 1-cycle write-back.
def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
  (instregex
    "STR(T?)_POST_IMM", "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;

// STR/STRB register post-indexed, scaled or not: 1(2) "S, M".
def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M],
  (instregex
    "STR(T?)_POST_REG", "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;

// Post-indexed STRH/STRD (STRH_POST, STRD_POST), STRHTi, STRHTr:
// 1(1) "S, I0/I1" for both the reg and the imm form.
def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
  (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
686
687// --- Store multiple instructions ---
688// TODO: no writeback latency defined in documentation
689def A57WriteSTM : SchedWriteVariant<[
690    SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
691    SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
692    SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
693    SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
694    SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
695    SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
696    SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
697    SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
698    SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
699]>;
// Store-multiple with base update: same N-cycle selection as A57WriteSTM,
// but each write also occupies an I pipe. Fallback is the 2-cycle write.
def A57WriteSTM_Upd : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
   SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
   SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
   SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
   SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
   SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
   SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
   SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
   SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>]>;
711
// Plain store-multiple forms (no base update).
def : InstRW<[A57WriteSTM],
             (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
// Base-updating forms and tPUSH: writeback write first, then the store write.
def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
             (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)",
                        "tPUSH")>;

// VLSTM is modelled as a fixed 5-cycle write on the S pipe.
def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
717
// --- 3.10 FP Data Processing Instructions ---
// Generic FP ALU writes (single and double precision): 5 cycles on a V pipe.
def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;

// Scalar FP absolute value: 3 cycles on a V pipe.
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VABS(S|D|H)")>;
723
// FP compare: 3cyc F1 when unconditional, 6cyc "F0/F1, F1" when the
// instruction is predicated (conditional).
def A57WriteVcmp : SchedWriteVariant<
  [SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
   SchedVar<NoSchedPred, [A57Write_3cyc_1X]>]>;

def : InstRW<[A57WriteVcmp],
             (instregex "VCMP(D|S|H|ZD|ZS|ZH)$",
                        "VCMPE(D|S|H|ZD|ZS|ZH)")>;
731
// FP convert: every form below is a 5-cycle write on a V pipe.
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)",
                        "VCVT(BDH|THD|TDH)")>;
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VTOSLS", "VTOUHS", "VTOULS")>;
def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;

def : InstRW<[A57Write_5cyc_1V],
             (instregex "VJCVT")>;

// FP round to integral
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
742
// FP divide and FP square root: 17 cycles for the 32-bit writes and
// 32 cycles for the 64-bit writes, all on the W pipe.
def : SchedAlias<WriteFPDIV32,  A57Write_17cyc_1W>;
def : SchedAlias<WriteFPDIV64,  A57Write_32cyc_1W>;
def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;

// Half-precision square root uses the 17-cycle write as well.
def : InstRW<[A57Write_17cyc_1W],
             (instregex "VSQRTH")>;

// FP max/min
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VMAX", "VMIN")>;
753
// The FP multiply-accumulate pipelines support late forwarding of the result
// of an FP multiply micro-op to the accumulate operand of an FP
// multiply-accumulate micro-op; the latter can potentially issue 1 cycle after
// the FP multiply micro-op has issued.
//
// FP multiply, FZ: 5 cycles on a V pipe.
def A57WriteVMUL : SchedWriteRes<[A57UnitV]> {
  let Latency = 5;
}

def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
// Multiply source operands get no read advance.
def : ReadAdvance<ReadFPMUL, 0>;
764
// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
def A57WriteVFMA : SchedWriteRes<[A57UnitV]> {
  let Latency = 9;
}

// VFMA takes 9 cyc in the common case and 4 cyc in a VFMA->VFMA chain
// (5 read advance).
// VMUL takes 5 cyc in the common case and 1 cyc in a VMUL->VFMA chain
// (4 read advance).
// There is currently no way to give the VFMA operand a different read advance
// depending on whether it comes from a VFMA or from a VMUL, so a read advance
// of 5 is used for both. The resulting zero latency (instead of one) for
// VMUL->VFMA shouldn't break anything.
// The same applies to the ASIMD VMUL/VFMA instructions.
// def A57ReadVFMA : SchedRead;
// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;

def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;

// VMLAH/VMLSH are not bound to scheduling classes by default, so they are
// described explicitly here:
def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
             (instregex "VMLAH", "VMLSH", "VNMLAH", "VNMLSH")>;
787
// Dot-product instructions reuse the FP multiply write.
def : InstRW<[A57WriteVMUL],
             (instregex "VUDOTD", "VSDOTD", "VUDOTQ", "VSDOTQ")>;

// VNEG and VSEL: 3 cycles on a V pipe.
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VNEG")>;
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VSEL")>;
793
// --- 3.11 FP Miscellaneous Instructions ---
// VMOV: 3cyc "F0/F1" for imm/reg
def : InstRW<[A57Write_3cyc_1V],
             (instregex "FCONST(D|S|H)")>;
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VMOV(D|S|H)(cc)?$")>;

def : InstRW<[A57Write_3cyc_1V],
             (instregex "VINSH")>;

// FP transfer, vfp to core reg: 5cyc L
// FP transfer, core reg to vfp: 5cyc L
def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
// The common code declares VMOVRRS/VMOVRRD with a single WriteFPMOV
// (instead of 2), so both writes are listed explicitly here.
def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L],
             (instregex "VMOV(RRS|RRD)")>;
806
// 8cyc "L, F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
// "F0/F1" is modelled as a V pipe throughout this file (see VDUP and VSETLN,
// which are also "L, F0/F1"); an I pipe would stand for "I0/I1" instead.
def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VMOVDRR")>;
809
// --- 3.12 FP Load Instructions ---
// Single FP register load: 5 cycles on the L pipe.
def : InstRW<[A57Write_5cyc_1L],
             (instregex "VLDR(D|S|H)")>;

// VLDMQIA uses the same 5-cycle L write.
def : InstRW<[A57Write_5cyc_1L],
             (instregex "VLDMQIA$")>;
814
// FP load multiple (VLDM)

// Per-result writes for unconditional VLDM: latencies go from 5 to 12 cycles,
// each value listed twice.
def A57VLDMOpsListUncond : A57WriteLMOpsListType<[
  A57Write_5cyc_1L,  A57Write_5cyc_1L,
  A57Write_6cyc_1L,  A57Write_6cyc_1L,
  A57Write_7cyc_1L,  A57Write_7cyc_1L,
  A57Write_8cyc_1L,  A57Write_8cyc_1L,
  A57Write_9cyc_1L,  A57Write_9cyc_1L,
  A57Write_10cyc_1L, A57Write_10cyc_1L,
  A57Write_11cyc_1L, A57Write_11cyc_1L,
  A57Write_12cyc_1L, A57Write_12cyc_1L
]>;
// A57LMAddrPred<N> takes the first 2*N writes of the list; with no matching
// predicate the whole list (16 writes) is used.
def A57WriteVLDMuncond : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>,
   SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>,
   SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>,
   SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>,
   SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
   SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
   SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
   SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>]> {
  let Variadic = 1;
}
836
// Per-result writes for conditional VLDM: latency grows by one cycle per
// entry, from 5 up to 20 cycles.
def A57VLDMOpsListCond : A57WriteLMOpsListType<[
  A57Write_5cyc_1L,  A57Write_6cyc_1L,
  A57Write_7cyc_1L,  A57Write_8cyc_1L,
  A57Write_9cyc_1L,  A57Write_10cyc_1L,
  A57Write_11cyc_1L, A57Write_12cyc_1L,
  A57Write_13cyc_1L, A57Write_14cyc_1L,
  A57Write_15cyc_1L, A57Write_16cyc_1L,
  A57Write_17cyc_1L, A57Write_18cyc_1L,
  A57Write_19cyc_1L, A57Write_20cyc_1L
]>;
// A57LMAddrPred<N> takes the first 2*N writes of the list; with no matching
// predicate the whole list (16 writes) is used.
def A57WriteVLDMcond : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>,
   SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>,
   SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>,
   SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>,
   SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
   SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
   SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
   SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>]> {
  let Variadic = 1;
}
856
// Top-level VLDM write: predicated instructions use the conditional variant,
// everything else the unconditional one.
def A57WriteVLDM : SchedWriteVariant<
  [SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
   SchedVar<NoSchedPred, [A57WriteVLDMuncond]>]> {
  let Variadic = 1;
}

def : InstRW<[A57WriteVLDM],
             (instregex "VLDM(DIA|SIA)$")>;
863
// Per-result writes for unconditional VLDM with base update: 5 to 12 cycles,
// each value listed twice, on L plus an I pipe.
def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<[
  A57Write_5cyc_1L_1I,  A57Write_5cyc_1L_1I,
  A57Write_6cyc_1L_1I,  A57Write_6cyc_1L_1I,
  A57Write_7cyc_1L_1I,  A57Write_7cyc_1L_1I,
  A57Write_8cyc_1L_1I,  A57Write_8cyc_1L_1I,
  A57Write_9cyc_1L_1I,  A57Write_9cyc_1L_1I,
  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
  A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
  A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I
]>;
// A57LMAddrPred<N> takes the first 2*N writes of the list; with no matching
// predicate the whole list (16 writes) is used.
def A57WriteVLDMuncond_UPD : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>,
   SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>,
   SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>,
   SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>,
   SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
   SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
   SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
   SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>]> {
  let Variadic = 1;
}
883
// Per-result writes for conditional VLDM with base update: latency grows by
// one cycle per entry, from 5 up to 20 cycles, on L plus an I pipe.
def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<[
  A57Write_5cyc_1L_1I,  A57Write_6cyc_1L_1I,
  A57Write_7cyc_1L_1I,  A57Write_8cyc_1L_1I,
  A57Write_9cyc_1L_1I,  A57Write_10cyc_1L_1I,
  A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
  A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
  A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
  A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
  A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I
]>;
// A57LMAddrPred<N> takes the first 2*N writes of the list; with no matching
// predicate the whole list (16 writes) is used.
def A57WriteVLDMcond_UPD : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>,
   SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>,
   SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>,
   SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>,
   SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
   SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
   SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
   SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>]> {
  let Variadic = 1;
}
903
// Top-level write for base-updating VLDM: predicated instructions use the
// conditional variant, everything else the unconditional one.
def A57WriteVLDM_UPD : SchedWriteVariant<
  [SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
   SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]>]> {
  let Variadic = 1;
}

// The writeback write comes first, followed by the variadic load writes.
def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
             (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
911
// --- 3.13 FP Store Instructions ---
// Single FP register store: 1 cycle on the S pipe.
def : InstRW<[A57Write_1cyc_1S],
             (instregex "VSTR(D|S|H)")>;

// VSTMQIA: 2 cycles on the S pipe.
def : InstRW<[A57Write_2cyc_1S],
             (instregex "VSTMQIA$")>;
916
// VSTM, S-register form: A57LMAddrPred<N> selects an N-cycle write on the
// S pipe (N = 1..8); the fallback is the 2-cycle write.
def A57WriteVSTMs : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
   SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
   SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
   SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
   SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
   SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
   SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
   SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
   SchedVar<NoSchedPred, [A57Write_2cyc_1S]>]>;
// VSTM, D-register form: A57LMAddrPred<N> selects a 2*N-cycle write on the
// S pipe (N = 1..8); the fallback is the 4-cycle write.
def A57WriteVSTMd : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
   SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
   SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
   SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
   SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
   SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
   SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
   SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
   SchedVar<NoSchedPred, [A57Write_4cyc_1S]>]>;
// VSTM with base update, S-register form: A57LMAddrPred<N> selects an N-cycle
// write on S plus an I pipe (N = 1..8); the fallback is the 2-cycle write.
def A57WriteVSTMs_Upd : SchedWriteVariant<
  [SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
   SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
   SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
   SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
   SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
   SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
   SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
   SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
   SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>]>;
// VSTM with base update, D-register form: A57LMAddrPred<N> selects a
// 2*N-cycle write on S plus an I pipe (N = 1..8).
// The NoSchedPred fallback uses the 4-cycle write so that it agrees with the
// fallback of the non-updating D-register variant A57WriteVSTMd (the
// S-register variants both fall back to 2 cycles).
def A57WriteVSTMd_Upd : SchedWriteVariant<[
    SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
    SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
    SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
    SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
    SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
    SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
    SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
    SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
    SchedVar<NoSchedPred,    [A57Write_4cyc_1S_1I]>
]>;
961
// Non-updating VSTM, S- and D-register forms.
def : InstRW<[A57WriteVSTMs],
             (instregex "VSTMSIA$")>;
def : InstRW<[A57WriteVSTMd],
             (instregex "VSTMDIA$")>;
// Base-updating VSTM: writeback write first, then the store variant.
def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
             (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
             (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
968
// --- 3.14 ASIMD Integer Instructions ---

// ASIMD absolute diff (integer VABD): 3cyc F0/F1
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VABD(s|u)")>;

// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form.
// Both forms give the accumulator operand a read advance of 3.
def A57WriteVABAD : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>;
def : InstRW<[A57WriteVABAD, A57ReadVABAD],
             (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;

def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> {
  let Latency = 5;
}
def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>;
def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
             (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;

// ASIMD absolute diff accum long (VABAL): 4(1) F1
def A57WriteVABAL : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>;
def : InstRW<[A57WriteVABAL, A57ReadVABAL],
             (instregex "VABAL(s|u)")>;

// ASIMD absolute diff long (VABDL): 3cyc F0/F1
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VABDL(s|u)")>;
991
// ASIMD arith, basic: 3 cycles on a V pipe.
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VADDv", "VADDL", "VADDW",
                        "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
                        "VPADDi", "VPADDL",
                        "VSUBv", "VSUBL", "VSUBW")>;
996
// ASIMD arith, complex: 3cyc F1
// NOTE(review): per the Cortex-A57 Software Optimization Guide this class
// issues on F1 only, which this file models as the X pipe (compare the
// "F1" entries for VCMP, VABA and the shifts) - confirm against the guide.
def : InstRW<[A57Write_3cyc_1X], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
  "VQABS", "VQADD", "VQNEG", "VQSUB",
  "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
1001
// ASIMD compare, logical and integer max/min all use the 3-cycle V write.

// ASIMD compare
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;

// ASIMD logical
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;

// ASIMD max/min
def : InstRW<[A57Write_3cyc_1V],
             (instregex "(VMAX|VMIN)(s|u)",
                        "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
1013
// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply and
// multiply-with-accumulate instructions relative to r0pX.

// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
def A57WriteVMULD_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
def : InstRW<[A57WriteVMULD_VecInt],
             (instregex "VMUL(v8i8|v4i16|v2i32|pd)",
                        "VMULsl(v4i16|v2i32)",
                        "VQDMULH(sl)?(v4i16|v2i32)",
                        "VQRDMULH(sl)?(v4i16|v2i32)")>;

// ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
def A57WriteVMULQ_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
def : InstRW<[A57WriteVMULQ_VecInt],
             (instregex "VMUL(v16i8|v8i16|v4i32|pq)",
                        "VMULsl(v8i16|v4i32)",
                        "VQDMULH(sl)?(v8i16|v4i32)",
                        "VQRDMULH(sl)?(v8i16|v4i32)")>;
1031
// ASIMD multiply accumulate, D-form
// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
// (ReadAdvance of 4 for r0px, 3 for r1p0 and later)
def A57WriteVMLAD_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
def A57ReadVMLAD_VecInt : SchedReadVariant<
  [SchedVar<IsR1P0AndLaterPred,
            [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
   SchedVar<NoSchedPred,
            [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>]>;
def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
             (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)",
                        "VMLS(sl)?(v8i8|v4i16|v2i32)")>;

// ASIMD multiply accumulate, Q-form
// 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
// (ReadAdvance of 4 for r0px, 3 for r1p0 and later)
def A57WriteVMLAQ_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
def A57ReadVMLAQ_VecInt : SchedReadVariant<
  [SchedVar<IsR1P0AndLaterPred,
            [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
   SchedVar<NoSchedPred,
            [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>]>;
def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
             (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)",
                        "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
1057
// ASIMD multiply accumulate long
// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
// (ReadAdvance of 4 for r0px, 3 for r1p0 and later)
def A57WriteVMLAL_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
def A57ReadVMLAL_VecInt : SchedReadVariant<
  [SchedVar<IsR1P0AndLaterPred,
            [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
   SchedVar<NoSchedPred,
            [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>]>;
def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
             (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
1070
// ASIMD multiply accumulate saturating long
// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
// (ReadAdvance of 3 for r0px, 2 for r1p0 and later)
def A57WriteVQDMLAL_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
def A57ReadVQDMLAL_VecInt : SchedReadVariant<
  [SchedVar<IsR1P0AndLaterPred,
            [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
   SchedVar<NoSchedPred,
            [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>]>;
def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
             (instregex "VQDMLAL", "VQDMLSL")>;

// Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long:
// scheduling info is borrowed from VQDMLAL/VQDMLSL.
def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
             (instregex "VQRDMLAH", "VQRDMLSH")>;
1088
// ASIMD multiply long
// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
def A57WriteVMULL_VecInt : SchedWriteVariant<
  [SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
   SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
def : InstRW<[A57WriteVMULL_VecInt],
             (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
1096
// ASIMD pairwise add and accumulate
// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>;
def : InstRW<[A57WriteVPADAL, A57ReadVPADAL],
             (instregex "VPADAL(s|u)")>;

// ASIMD shift accumulate
// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
def A57WriteVSRA : SchedWriteRes<[A57UnitX]> {
  let Latency = 4;
}
def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>;
def : InstRW<[A57WriteVSRA, A57ReadVSRA],
             (instregex "VSRA", "VRSRA")>;
1108
// All ASIMD shifts below are issued on the X pipe; only the latency differs.

// ASIMD shift by immed, basic
def : InstRW<[A57Write_3cyc_1X],
             (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;

// ASIMD shift by immed, complex
def : InstRW<[A57Write_4cyc_1X],
             (instregex "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN",
                        "VQSHRUN", "VRSHR(s|u)", "VRSHRN")>;

// ASIMD shift by immed and insert, basic, D-form
def : InstRW<[A57Write_4cyc_1X],
             (instregex "VSLI(v8i8|v4i16|v2i32|v1i64)",
                        "VSRI(v8i8|v4i16|v2i32|v1i64)")>;

// ASIMD shift by immed and insert, basic, Q-form
def : InstRW<[A57Write_5cyc_1X],
             (instregex "VSLI(v16i8|v8i16|v4i32|v2i64)",
                        "VSRI(v16i8|v8i16|v4i32|v2i64)")>;

// ASIMD shift by register, basic, D-form
def : InstRW<[A57Write_3cyc_1X],
             (instregex "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;

// ASIMD shift by register, basic, Q-form
def : InstRW<[A57Write_4cyc_1X],
             (instregex "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;

// ASIMD shift by register, complex, D-form
// VQRSHL, VQSHL, VRSHL
def : InstRW<[A57Write_4cyc_1X],
             (instregex "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
                        "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
                        "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;

// ASIMD shift by register, complex, Q-form
def : InstRW<[A57Write_5cyc_1X],
             (instregex "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
                        "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
                        "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1144
// --- 3.15 ASIMD Floating-Point Instructions ---
// ASIMD FP absolute value
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VABS(fd|fq|hd|hq)")>;

// ASIMD FP arith
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VABD(fd|fq|hd|hq)",
                        "VADD(fd|fq|hd|hq)",
                        "VPADD(f|h)",
                        "VSUB(fd|fq|hd|hq)")>;

def : InstRW<[A57Write_5cyc_1V],
             (instregex "VCADD", "VCMLA")>;

// ASIMD FP compare
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VAC(GE|GT|LE|LT)",
                        "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;

// ASIMD FP convert, integer
def : InstRW<[A57Write_5cyc_1V],
             (instregex
  "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
  "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
  "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;

// ASIMD FP convert, half-precision: 8cyc F0/F1
def : InstRW<[A57Write_8cyc_1V],
             (instregex
  "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
  "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
  "VCVT(f2h|h2f)")>;

// ASIMD FP max/min
def : InstRW<[A57Write_5cyc_1V],
             (instregex "(VMAX|VMIN)(fd|fq|hd|hq)",
                        "(VPMAX|VPMIN)(f|h)",
                        "(NEON|VFP)_VMAXNM",
                        "(NEON|VFP)_VMINNM")>;

// ASIMD FP multiply: 5 cycles on a V pipe.
def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> {
  let Latency = 5;
}
def : InstRW<[A57WriteVMUL_VecFP],
             (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;

// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence.
// The read advance of 5 applies to results of both the FP multiply-accumulate
// and the FP multiply writes.
def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> {
  let Latency = 9;
}
def A57ReadVMLA_VecFP :
  SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
             (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)",
                        "(VFMA|VFMS)(fd|fq|hd|hq)")>;

// ASIMD FP negate
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VNEG(fd|f32q|hd|hq)")>;

// ASIMD FP round to integral
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
1193
// --- 3.16 ASIMD Miscellaneous Instructions ---

// ASIMD bitwise insert
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VBIF", "VBIT", "VBSL", "VBSP")>;

// ASIMD count
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VCLS", "VCLZ", "VCNT")>;

// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V],
             (instregex "VDUP(8|16|32)(d|q)")>;

// ASIMD duplicate, scalar: 3cyc "F0/F1"
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VDUPLN(8|16|32)(d|q)")>;

// ASIMD extract
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VEXT(d|q)(8|16|32|64)")>;

// ASIMD move, immed
def : InstRW<[A57Write_3cyc_1V],
             (instregex
  "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
  "VMOVD0", "VMOVQ0")>;

// ASIMD move, narrowing
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VMOVN")>;

// ASIMD move, saturating
def : InstRW<[A57Write_4cyc_1X],
             (instregex "VQMOVN")>;

// ASIMD reciprocal estimate
def : InstRW<[A57Write_5cyc_1V],
             (instregex "VRECPE", "VRSQRTE")>;

// ASIMD reciprocal step, FZ
def : InstRW<[A57Write_9cyc_1V],
             (instregex "VRECPS", "VRSQRTS")>;

// ASIMD reverse, swap, table lookup (1-2 reg)
def : InstRW<[A57Write_3cyc_1V],
             (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;

// ASIMD table lookup (3-4 reg)
def : InstRW<[A57Write_6cyc_1V],
             (instregex "VTBL(3|4)", "VTBX(3|4)")>;

// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
def : InstRW<[A57Write_6cyc_1L_1I],
             (instregex "VGETLN")>;

// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V],
             (instregex "VSETLN")>;

// ASIMD transpose (two writes listed)
def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
             (instregex "VTRN")>;

// ASIMD unzip/zip, D-form
def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
             (instregex "VUZPd", "VZIPd")>;

// ASIMD unzip/zip, Q-form
def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
             (instregex "VUZPq", "VZIPq")>;
1250
// --- 3.17 ASIMD Load Instructions ---

// The generic NEON load/store writes get empty resource lists here; they are
// overridden via InstRW for this processor.
def : WriteRes<WriteVLD1, []>;
def : WriteRes<WriteVLD2, []>;
def : WriteRes<WriteVLD3, []>;
def : WriteRes<WriteVLD4, []>;

def : WriteRes<WriteVST1, []>;
def : WriteRes<WriteVST2, []>;
def : WriteRes<WriteVST3, []>;
def : WriteRes<WriteVST4, []>;
1262
// VLD1, 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
def : InstRW<[A57Write_5cyc_1L],
             (instregex "VLD1(d|q)(8|16|32|64)$")>;
def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
             (instregex "VLD1(d|q)(8|16|32|64)wb")>;

// VLD1, 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
def : InstRW<[A57Write_6cyc_1L],
             (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$",
                        "VLD1d64(T|Q)Pseudo")>;

def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
             (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
1274
// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
// The writeback forms add an I pipe and the A57WrBackOne write.
def : InstRW<[A57Write_8cyc_1L_1V],
             (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$",
                        "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
                        "VLD1LNq(8|16|32)Pseudo_UPD")>;
1280
// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V],
             (instregex "VLD2(d|q)(8|16|32)$",
                        "VLD2q(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD2(d|q)(8|16|32)wb",
                        "VLD2q(8|16|32)PseudoWB")>;

// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
def : InstRW<[A57Write_9cyc_1L_1V],
             (instregex "VLD2b(8|16|32)$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD2b(8|16|32)wb")>;
1291
// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
             (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
                        "VLD2LN(d|q)(8|16|32)Pseudo$")>;
// Two results plus the writeback result.
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
             (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
// One result plus the writeback result.
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
                        "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
1303
// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
// Three results.
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
             (instregex "VLD3(d|q)(8|16|32)$")>;
// One result (pseudo forms).
def : InstRW<[A57Write_9cyc_1L_1V],
             (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
// Three results plus writeback.
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
// One result plus writeback (pseudo forms).
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1318
// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
             (instregex "VLD3LN(d|q)32$",
                        "VLD3LN(d|q)32Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD3LN(d|q)32_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD3LN(d|q)32Pseudo_UPD")>;

// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
             (instregex "VLD3LN(d|q)(8|16)$",
                        "VLD3LN(d|q)(8|16)Pseudo$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD3LN(d|q)(8|16)_UPD")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;

// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
             (instregex "VLD3DUP(d|q)(8|16|32)$",
                        "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
1348
// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
// Four results.
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
             (instregex "VLD4(d|q)(8|16|32)$")>;
// One result (pseudo forms).
def : InstRW<[A57Write_9cyc_1L_1V],
             (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
// Four results plus writeback.
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD4(d|q)(8|16|32)_UPD")>;
// One result plus writeback (pseudo forms).
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
             (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1360
// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
             (instregex "VLD4LN(d|q)32$",
                        "VLD4LN(d|q)32Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD4LN(d|q)32_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD4LN(d|q)32Pseudo_UPD")>;

// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
def : InstRW<[A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V,
              A57Write_9cyc_1L_1V],
             (instregex "VLD4LN(d|q)(8|16)$",
                        "VLD4LN(d|q)(8|16)Pseudo$")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57Write_9cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD4LN(d|q)(8|16)_UPD")>;
def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;

// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
def : InstRW<[A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V,
              A57Write_8cyc_1L_1V],
             (instregex "VLD4DUP(d|q)(8|16|32)$",
                        "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57Write_8cyc_1L_1V_1I,
              A57WrBackOne],
             (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
             (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
1396
// --- 3.18 ASIMD Store Instructions ---
//
// In this section every writeback form lists A57WrBackOne first, followed by
// the "_1I" variant of the write used for the matching non-writeback form.

// ASIMD store, 1 element, multiple, 1 reg: 1cyc S
def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
      (instregex "VST1d(8|16|32|64)wb")>;
// ASIMD store, 1 element, multiple, 2 reg: 2cyc S
def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
      (instregex "VST1q(8|16|32|64)wb")>;
// ASIMD store, 1 element, multiple, 3 reg: 3cyc S
def : InstRW<[A57Write_3cyc_1S],
      (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
      (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
// ASIMD store, 1 element, multiple, 4 reg: 4cyc S
def : InstRW<[A57Write_4cyc_1S],
      (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
      (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
def : InstRW<[A57Write_3cyc_1S_1V],
      (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
      (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
def : InstRW<[A57Write_3cyc_1S_1V],
      (instregex "VST2(d|b)(8|16|32)$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
      (instregex "VST2(b|d)(8|16|32)wb")>;
// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
def : InstRW<[A57Write_4cyc_1S_1V],
      (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
      (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
def : InstRW<[A57Write_3cyc_1S_1V],
      (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
      (instregex "VST2LN(d|q)(8|16|32)_UPD",
                 "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
// ASIMD store, 3 element, multiple, 3 reg: modeled as 3cyc "F0/F1, S"
def : InstRW<[A57Write_3cyc_1S_1V],
      (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
      (instregex "VST3(d|q)(8|16|32)_UPD",
                 "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
// ASIMD store, 3 element, one lane: modeled as 3cyc "F0/F1, S"
def : InstRW<[A57Write_3cyc_1S_1V],
      (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
      (instregex "VST3LN(d|q)(8|16|32)_UPD",
                 "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
// ASIMD store, 4 element, multiple, 4 reg: modeled as 4cyc "F0/F1, S"
def : InstRW<[A57Write_4cyc_1S_1V],
      (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
      (instregex "VST4(d|q)(8|16|32)_UPD",
                 "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
// ASIMD store, 4 element, one lane: modeled as 3cyc "F0/F1, S"
def : InstRW<[A57Write_3cyc_1S_1V],
      (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
      (instregex "VST4LN(d|q)(8|16|32)_UPD",
                 "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;

// --- 3.19 Cryptography Extensions ---
// Crypto AES ops
// AESD, AESE, AESIMC, AESMC: 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
// Crypto SHA1 xor ops: 6cyc F0/F1
def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
// Crypto SHA1 fast ops: 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
// Crypto SHA1 slow ops: 6cyc F0
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
// Crypto SHA256 fast ops: 3cyc F0
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
// Crypto SHA256 slow ops: 6cyc F0
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;

// --- 3.20 CRC ---
// CRC32 opcodes, with or without the "t2" prefix: modeled with the same
// A57Write_3cyc_1W write as the 3cyc crypto ops above.
def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;

// -----------------------------------------------------------------------------
// Common definitions
// WriteNoop uses no processor resources, has zero latency and zero micro-ops.
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
// WriteALU is aliased to a branch-form-dependent write built from
// A57Write_1cyc_1I.
def : SchedAlias<WriteALU, CheckBranchForm<0, A57BranchForm<A57Write_1cyc_1I>>>;

// Generic branch and preload writes are aliased to A57 writes.
def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;

// Generic load/store writes; ReadALU is given a read advance of 0 cycles.
def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
def : SchedAlias<WriteST, A57Write_1cyc_1S>;
def : ReadAdvance<ReadALU, 0>;

1497} // SchedModel = CortexA57Model
1498
1499