xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA9.td (revision ba3c1f5972d7b90feb6e6da47905ff2757e0fe57)
1//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the itinerary class data for the ARM Cortex A9 processors.
10//
11//===----------------------------------------------------------------------===//
12
13// ===---------------------------------------------------------------------===//
14// This section contains legacy support for itineraries. It is required
15// until the SD and PostRA schedulers are replaced by MachineScheduler.
16
17//
18// Ad-hoc scheduling information derived from the rather vague "Cortex-A9
19// Technical Reference Manual".
20//
21// Functional units
22def A9_Issue0  : FuncUnit; // Issue slot 0
23def A9_Issue1  : FuncUnit; // Issue slot 1
24def A9_Branch  : FuncUnit; // Branch unit (IIC_Br, load-multiple/pop + branch)
25def A9_ALU0    : FuncUnit; // ALU / MUL pipeline 0 (the only ALU used by iMUL*/iMAC*)
26def A9_ALU1    : FuncUnit; // ALU pipeline 1
27def A9_AGU     : FuncUnit; // Address generation unit for ld / st
28def A9_NPipe   : FuncUnit; // NEON pipeline (also claimed by the VFP itineraries)
29def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer (ld / st and VFP itineraries)
30def A9_LSUnit  : FuncUnit; // Load / store unit
31def A9_DRegsVFP: FuncUnit; // FP register set, VFP side (artificial FU, see VFP notes)
32def A9_DRegsN  : FuncUnit; // FP register set, NEON side (artificial FU, see VFP notes)
33
34// Bypasses: A9_LdBypass is listed on load result operands and on the source
// operands of the ALU / MVN / compare itineraries below.
35def A9_LdBypass : Bypass;
36
37def CortexA9Itineraries : ProcessorItineraries<
38  [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
39   A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
40  [A9_LdBypass], [
41  // Two fully-pipelined integer ALU pipelines
42
43  //
44  // Move instructions, unconditional
45  InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
46                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
47  InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
48                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
49  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
50                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
51  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
52                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
53  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
54                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
55                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
56  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
57                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
58                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
59                                  InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
60  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
61                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
62                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
63                               InstrStage<1, [A9_MUX0], 0>,
64                               InstrStage<1, [A9_AGU], 0>,
65                               InstrStage<1, [A9_LSUnit]>], [5]>,
66  //
67  // MVN instructions
68  InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
69                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
70                              [1]>,
71  InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
72                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
73                              [1, 1], [NoBypass, A9_LdBypass]>,
74  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
75                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
76                              [2, 1]>,
77  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
78                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
79                              [3, 1, 1]>,
80  //
81  // No operand cycles
82  InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
83                               InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
84  //
85  // Binary Instructions that produce a result
86  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
87                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
88                            [1, 1], [NoBypass, A9_LdBypass]>,
89  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
90                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
91                            [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
92  InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
93                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
94                            [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
95  InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
96                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
97                            [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
98  InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
99                             InstrStage<3, [A9_ALU0, A9_ALU1]>],
100                            [3, 1, 1, 1],
101                            [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
102  //
103  // Bitwise Instructions that produce a result
104  InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
105                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
106  InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
107                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
108  InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
109                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
110  InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
111                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
112  //
113  // Unary Instructions that produce a result
114
115  // CLZ, RBIT, etc.
116  InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
117                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
118
119  // BFC, BFI, UBFX, SBFX
120  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
121                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
122
123  //
124  // Zero and sign extension instructions
125  InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
126                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
127  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
128                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
129  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
130                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
131  //
132  // Compare instructions
133  InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
134                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
135                               [1], [A9_LdBypass]>,
136  InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
137                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
138                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
139  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
140                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
141                                [1, 1], [A9_LdBypass, NoBypass]>,
142  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
143                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
144                              [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
145  //
146  // Test instructions
147  InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
148                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
149  InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
150                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
151  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
152                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
153  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
154                               InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
155  //
156  // Move instructions, conditional
157  // FIXME: Correctly model the extra input dep on the destination.
158  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
159                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
160  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
161                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
162  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
163                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
164  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
165                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
166  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
167                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
168                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
169                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
170
171  // Integer multiply pipeline
172  //
173  InstrItinData<IIC_iMUL16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
174                               InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
175  InstrItinData<IIC_iMAC16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
176                               InstrStage<2, [A9_ALU0]>],
177                              [3, 1, 1, 1]>,
178  InstrItinData<IIC_iMUL32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
179                               InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
180  InstrItinData<IIC_iMAC32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
181                               InstrStage<2, [A9_ALU0]>],
182                              [4, 1, 1, 1]>,
183  InstrItinData<IIC_iMUL64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
184                               InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
185  InstrItinData<IIC_iMAC64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
186                               InstrStage<3, [A9_ALU0]>],
187                              [4, 5, 1, 1]>,
188  // Integer load pipeline
189  // FIXME: The timings are some rough approximations
190  //
191  // Immediate offset
192  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
193                                 InstrStage<1, [A9_MUX0], 0>,
194                                 InstrStage<1, [A9_AGU], 0>,
195                                 InstrStage<1, [A9_LSUnit]>],
196                                [3, 1], [A9_LdBypass]>,
197  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
198                                 InstrStage<1, [A9_MUX0], 0>,
199                                 InstrStage<2, [A9_AGU], 0>,
200                                 InstrStage<1, [A9_LSUnit]>],
201                                [4, 1], [A9_LdBypass]>,
202  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
203  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
204                                 InstrStage<1, [A9_MUX0], 0>,
205                                 InstrStage<2, [A9_AGU], 0>,
206                                 InstrStage<1, [A9_LSUnit]>],
207                                [3, 3, 1], [A9_LdBypass]>,
208  //
209  // Register offset
210  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
211                                 InstrStage<1, [A9_MUX0], 0>,
212                                 InstrStage<1, [A9_AGU], 0>,
213                                 InstrStage<1, [A9_LSUnit]>],
214                                [3, 1, 1], [A9_LdBypass]>,
215  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
216                                 InstrStage<1, [A9_MUX0], 0>,
217                                 InstrStage<2, [A9_AGU], 0>,
218                                 InstrStage<1, [A9_LSUnit]>],
219                                [4, 1, 1], [A9_LdBypass]>,
220  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
221                                 InstrStage<1, [A9_MUX0], 0>,
222                                 InstrStage<2, [A9_AGU], 0>,
223                                 InstrStage<1, [A9_LSUnit]>],
224                                [3, 3, 1, 1], [A9_LdBypass]>,
225  //
226  // Scaled register offset
227  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
228                                 InstrStage<1, [A9_MUX0], 0>,
229                                 InstrStage<1, [A9_AGU], 0>,
230                                 InstrStage<1, [A9_LSUnit], 0>],
231                                [4, 1, 1], [A9_LdBypass]>,
232  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
233                                 InstrStage<1, [A9_MUX0], 0>,
234                                 InstrStage<2, [A9_AGU], 0>,
235                                 InstrStage<1, [A9_LSUnit]>],
236                                [5, 1, 1], [A9_LdBypass]>,
237  //
238  // Immediate offset with update
239  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
240                                 InstrStage<1, [A9_MUX0], 0>,
241                                 InstrStage<1, [A9_AGU], 0>,
242                                 InstrStage<1, [A9_LSUnit]>],
243                                [3, 2, 1], [A9_LdBypass]>,
244  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
245                                 InstrStage<1, [A9_MUX0], 0>,
246                                 InstrStage<2, [A9_AGU], 0>,
247                                 InstrStage<1, [A9_LSUnit]>],
248                                [4, 3, 1], [A9_LdBypass]>,
249  //
250  // Register offset with update
251  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
252                                 InstrStage<1, [A9_MUX0], 0>,
253                                 InstrStage<1, [A9_AGU], 0>,
254                                 InstrStage<1, [A9_LSUnit]>],
255                                [3, 2, 1, 1], [A9_LdBypass]>,
256  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
257                                 InstrStage<1, [A9_MUX0], 0>,
258                                 InstrStage<2, [A9_AGU], 0>,
259                                 InstrStage<1, [A9_LSUnit]>],
260                                [4, 3, 1, 1], [A9_LdBypass]>,
261  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
262                                 InstrStage<1, [A9_MUX0], 0>,
263                                 InstrStage<2, [A9_AGU], 0>,
264                                 InstrStage<1, [A9_LSUnit]>],
265                                [3, 3, 1, 1], [A9_LdBypass]>,
266  //
267  // Scaled register offset with update
268  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
269                                 InstrStage<1, [A9_MUX0], 0>,
270                                 InstrStage<1, [A9_AGU], 0>,
271                                 InstrStage<1, [A9_LSUnit]>],
272                                [4, 3, 1, 1], [A9_LdBypass]>,
273  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
274                                  InstrStage<1, [A9_MUX0], 0>,
275                                  InstrStage<2, [A9_AGU], 0>,
276                                  InstrStage<1, [A9_LSUnit]>],
277                                 [5, 4, 1, 1], [A9_LdBypass]>,
278  //
279  // Load multiple, def is the 5th operand.
280  // FIXME: This assumes 3 to 4 registers.
281  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
282                                InstrStage<1, [A9_MUX0], 0>,
283                                InstrStage<2, [A9_AGU], 1>,
284                                InstrStage<2, [A9_LSUnit]>],
285                               [1, 1, 1, 1, 3],
286                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
287                         -1>, // dynamic uops
288  //
289  // Load multiple + update, defs are the 1st and 5th operands.
290  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
291                                InstrStage<1, [A9_MUX0], 0>,
292                                InstrStage<2, [A9_AGU], 1>,
293                                InstrStage<2, [A9_LSUnit]>],
294                               [2, 1, 1, 1, 3],
295                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
296                         -1>, // dynamic uops
297  //
298  // Load multiple plus branch
299  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
300                                InstrStage<1, [A9_MUX0], 0>,
301                                InstrStage<1, [A9_AGU], 1>,
302                                InstrStage<2, [A9_LSUnit]>,
303                                InstrStage<1, [A9_Branch]>],
304                               [1, 2, 1, 1, 3],
305                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
306                         -1>, // dynamic uops
307  //
308  // Pop, def is the 3rd operand.
309  InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
310                                InstrStage<1, [A9_MUX0], 0>,
311                                InstrStage<2, [A9_AGU], 1>,
312                                InstrStage<2, [A9_LSUnit]>],
313                               [1, 1, 3],
314                               [NoBypass, NoBypass, A9_LdBypass],
315                               -1>, // dynamic uops
316  //
317  // Pop + branch, def is the 3rd operand.
318  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
319                                InstrStage<1, [A9_MUX0], 0>,
320                                InstrStage<2, [A9_AGU], 1>,
321                                InstrStage<2, [A9_LSUnit]>,
322                                InstrStage<1, [A9_Branch]>],
323                               [1, 1, 3],
324                               [NoBypass, NoBypass, A9_LdBypass],
325                               -1>, // dynamic uops
326  //
327  // iLoadi + iALUr for t2LDRpci_pic.
328  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
329                                InstrStage<1, [A9_MUX0], 0>,
330                                InstrStage<1, [A9_AGU], 0>,
331                                InstrStage<1, [A9_LSUnit]>,
332                                InstrStage<1, [A9_ALU0, A9_ALU1]>],
333                               [2, 1]>,
334
335  // Integer store pipeline
336  //
337  // Immediate offset
338  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
339                                 InstrStage<1, [A9_MUX0], 0>,
340                                 InstrStage<1, [A9_AGU], 0>,
341                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
342  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
343                                 InstrStage<1, [A9_MUX0], 0>,
344                                 InstrStage<2, [A9_AGU], 1>,
345                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
346  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
347  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
348                                 InstrStage<1, [A9_MUX0], 0>,
349                                 InstrStage<2, [A9_AGU], 1>,
350                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
351  //
352  // Register offset
353  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
354                                 InstrStage<1, [A9_MUX0], 0>,
355                                 InstrStage<1, [A9_AGU], 0>,
356                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
357  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
358                                 InstrStage<1, [A9_MUX0], 0>,
359                                 InstrStage<2, [A9_AGU], 1>,
360                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
361  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
362                                 InstrStage<1, [A9_MUX0], 0>,
363                                 InstrStage<2, [A9_AGU], 1>,
364                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
365  //
366  // Scaled register offset
367  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
368                                  InstrStage<1, [A9_MUX0], 0>,
369                                  InstrStage<1, [A9_AGU], 0>,
370                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
371  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
372                                  InstrStage<1, [A9_MUX0], 0>,
373                                  InstrStage<2, [A9_AGU], 1>,
374                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
375  //
376  // Immediate offset with update
377  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
378                                  InstrStage<1, [A9_MUX0], 0>,
379                                  InstrStage<1, [A9_AGU], 0>,
380                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
381  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
382                                  InstrStage<1, [A9_MUX0], 0>,
383                                  InstrStage<2, [A9_AGU], 1>,
384                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
385  //
386  // Register offset with update
387  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
388                                  InstrStage<1, [A9_MUX0], 0>,
389                                  InstrStage<1, [A9_AGU], 0>,
390                                  InstrStage<1, [A9_LSUnit]>],
391                                 [2, 1, 1, 1]>,
392  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
393                                  InstrStage<1, [A9_MUX0], 0>,
394                                  InstrStage<2, [A9_AGU], 1>,
395                                  InstrStage<1, [A9_LSUnit]>],
396                                 [3, 1, 1, 1]>,
397  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
398                                  InstrStage<1, [A9_MUX0], 0>,
399                                  InstrStage<2, [A9_AGU], 1>,
400                                  InstrStage<1, [A9_LSUnit]>],
401                                 [3, 1, 1, 1]>,
402  //
403  // Scaled register offset with update
404  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
405                                    InstrStage<1, [A9_MUX0], 0>,
406                                    InstrStage<1, [A9_AGU], 0>,
407                                    InstrStage<1, [A9_LSUnit]>],
408                                   [2, 1, 1, 1]>,
409  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
410                                    InstrStage<1, [A9_MUX0], 0>,
411                                    InstrStage<2, [A9_AGU], 1>,
412                                    InstrStage<1, [A9_LSUnit]>],
413                                   [3, 1, 1, 1]>,
414  //
415  // Store multiple
416  InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
417                                InstrStage<1, [A9_MUX0], 0>,
418                                InstrStage<1, [A9_AGU], 0>,
419                                InstrStage<2, [A9_LSUnit]>],
420                [], [], -1>, // dynamic uops
421  //
422  // Store multiple + update
423  InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
424                                InstrStage<1, [A9_MUX0], 0>,
425                                InstrStage<1, [A9_AGU], 0>,
426                                InstrStage<2, [A9_LSUnit]>],
427                [2], [], -1>, // dynamic uops
428  //
429  // Preload
430  InstrItinData<IIC_Preload,   [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
431
432  // Branch
433  //
434  // no delay slots, so the latency of a branch is unimportant
435  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Issue0], 0>,
436                                InstrStage<1, [A9_Issue1], 0>,
437                                InstrStage<1, [A9_Branch]>]>,
438
439  // VFP and NEON share the same register file. This means that every VFP
440  // instruction must wait for full completion of the preceding NEON
441  // instruction and vice versa. We model this behavior with two artificial FUs:
442  // DRegsVFP and DRegsN.
443  //
444  // Every VFP instruction:
445  //  - Acquires DRegsVFP resource for 1 cycle
446  //  - Reserves DRegsN resource for the whole duration (including time to
447  //    register file writeback!).
448  // Every NEON instruction does the same but with FUs swapped.
449  //
450  // Since a reserved FU cannot be acquired, this precisely models
451  // "cross-domain" stalls.
452
453  // VFP
454  // Issue through integer pipeline, and execute in NEON unit.
455
456  // FP Special Register to Integer Register File Move
457  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
458                              InstrStage<1, [A9_MUX0], 0>,
459                              InstrStage<1, [A9_DRegsVFP], 0, Required>,
460                              InstrStage<2, [A9_DRegsN],   0, Reserved>,
461                              InstrStage<1, [A9_NPipe]>],
462                             [1]>,
463  //
464  // Single-precision FP Unary
465  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
466                               InstrStage<1, [A9_MUX0], 0>,
467                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
468                               // Extra latency cycles since writeback is 2 cycles
469                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
470                               InstrStage<1, [A9_NPipe]>],
471                              [1, 1]>,
472  //
473  // Double-precision FP Unary
474  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
475                               InstrStage<1, [A9_MUX0], 0>,
476                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
477                               // Extra latency cycles since writeback is 2 cycles
478                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
479                               InstrStage<1, [A9_NPipe]>],
480                              [1, 1]>,
481
482  //
483  // Single-precision FP Compare
484  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
485                               InstrStage<1, [A9_MUX0], 0>,
486                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
487                               // Extra latency cycles since writeback is 4 cycles
488                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
489                               InstrStage<1, [A9_NPipe]>],
490                              [1, 1]>,
491  //
492  // Double-precision FP Compare
493  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
494                               InstrStage<1, [A9_MUX0], 0>,
495                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
496                               // Extra latency cycles since writeback is 4 cycles
497                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
498                               InstrStage<1, [A9_NPipe]>],
499                              [1, 1]>,
500  //
501  // Single to Double FP Convert
502  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
503                               InstrStage<1, [A9_MUX0], 0>,
504                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
505                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
506                               InstrStage<1, [A9_NPipe]>],
507                              [4, 1]>,
508  //
509  // Double to Single FP Convert
510  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
511                               InstrStage<1, [A9_MUX0], 0>,
512                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
513                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
514                               InstrStage<1, [A9_NPipe]>],
515                              [4, 1]>,
516
517  //
518  // Single to Half FP Convert
519  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
520                               InstrStage<1, [A9_MUX0], 0>,
521                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
522                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
523                               InstrStage<1, [A9_NPipe]>],
524                              [4, 1]>,
525  //
526  // Half to Single FP Convert
527  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
528                               InstrStage<1, [A9_MUX0], 0>,
529                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
530                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
531                               InstrStage<1, [A9_NPipe]>],
532                              [2, 1]>,
533
534  //
535  // Single-Precision FP to Integer Convert
536  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
537                               InstrStage<1, [A9_MUX0], 0>,
538                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
539                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
540                               InstrStage<1, [A9_NPipe]>],
541                              [4, 1]>,
542  //
543  // Double-Precision FP to Integer Convert
544  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
545                               InstrStage<1, [A9_MUX0], 0>,
546                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
547                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
548                               InstrStage<1, [A9_NPipe]>],
549                              [4, 1]>,
550  //
551  // Integer to Single-Precision FP Convert
552  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
553                               InstrStage<1, [A9_MUX0], 0>,
554                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
555                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
556                               InstrStage<1, [A9_NPipe]>],
557                              [4, 1]>,
558  //
559  // Integer to Double-Precision FP Convert
560  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
561                               InstrStage<1, [A9_MUX0], 0>,
562                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
563                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
564                               InstrStage<1, [A9_NPipe]>],
565                              [4, 1]>,
566  //
567  // Single-precision FP ALU
568  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
569                               InstrStage<1, [A9_MUX0], 0>,
570                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
571                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
572                               InstrStage<1, [A9_NPipe]>],
573                              [4, 1, 1]>,
574  //
575  // Double-precision FP ALU
576  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
577                               InstrStage<1, [A9_MUX0], 0>,
578                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
579                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
580                               InstrStage<1, [A9_NPipe]>],
581                              [4, 1, 1]>,
582  //
583  // Single-precision FP Multiply
584  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
585                               InstrStage<1, [A9_MUX0], 0>,
586                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
587                               InstrStage<6, [A9_DRegsN],   0, Reserved>,
588                               InstrStage<1, [A9_NPipe]>],
589                              [5, 1, 1]>,
590  //
591  // Double-precision FP Multiply
592  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
593                               InstrStage<1, [A9_MUX0], 0>,
594                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
595                               InstrStage<7, [A9_DRegsN],   0, Reserved>,
596                               InstrStage<2, [A9_NPipe]>],
597                              [6, 1, 1]>,
598  //
599  // Single-precision FP MAC
600  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
601                               InstrStage<1, [A9_MUX0], 0>,
602                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
603                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
604                               InstrStage<1, [A9_NPipe]>],
605                              [8, 1, 1, 1]>,
606  //
607  // Double-precision FP MAC
608  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
609                               InstrStage<1,  [A9_MUX0], 0>,
610                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
611                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
612                               InstrStage<2,  [A9_NPipe]>],
613                              [9, 1, 1, 1]>,
614  //
615  // Single-precision Fused FP MAC
616  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
617                               InstrStage<1, [A9_MUX0], 0>,
618                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
619                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
620                               InstrStage<1, [A9_NPipe]>],
621                              [8, 1, 1, 1]>,
622  //
623  // Double-precision Fused FP MAC
624  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
625                               InstrStage<1,  [A9_MUX0], 0>,
626                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
627                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
628                               InstrStage<2,  [A9_NPipe]>],
629                              [9, 1, 1, 1]>,
630  //
631  // Single-precision FP DIV
632  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
633                               InstrStage<1,  [A9_MUX0], 0>,
634                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
635                               InstrStage<16, [A9_DRegsN],  0, Reserved>,
636                               InstrStage<10, [A9_NPipe]>],
637                              [15, 1, 1]>,
638  //
639  // Double-precision FP DIV
640  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
641                               InstrStage<1,  [A9_MUX0], 0>,
642                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
643                               InstrStage<26, [A9_DRegsN],  0, Reserved>,
644                               InstrStage<20, [A9_NPipe]>],
645                              [25, 1, 1]>,
646  //
647  // Single-precision FP SQRT
648  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
649                               InstrStage<1,  [A9_MUX0], 0>,
650                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
651                               InstrStage<18, [A9_DRegsN],   0, Reserved>,
652                               InstrStage<13, [A9_NPipe]>],
653                              [17, 1]>,
654  //
655  // Double-precision FP SQRT
656  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
657                               InstrStage<1,  [A9_MUX0], 0>,
658                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
659                               InstrStage<33, [A9_DRegsN],   0, Reserved>,
660                               InstrStage<28, [A9_NPipe]>],
661                              [32, 1]>,
662
663  //
664  // Integer to Single-precision Move
665  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
666                               InstrStage<1, [A9_MUX0], 0>,
667                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
668                               // Extra 1 latency cycle since writeback is 2 cycles
669                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
670                               InstrStage<1, [A9_NPipe]>],
671                              [1, 1]>,
672  //
673  // Integer to Double-precision Move
674  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
675                               InstrStage<1, [A9_MUX0], 0>,
676                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
677                               // Extra 1 latency cycle since writeback is 2 cycles
678                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
679                               InstrStage<1, [A9_NPipe]>],
680                              [1, 1, 1]>,
681  //
682  // Single-precision to Integer Move
683  //
684  // On A9 move-from-VFP is free to issue with no stall if other VFP
685  // operations are in flight. I assume it still can't dual-issue though.
686  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
687                               InstrStage<1, [A9_MUX0], 0>],
688                              [2, 1]>,
689  //
690  // Double-precision to Integer Move
691  //
692  // On A9 move-from-VFP is free to issue with no stall if other VFP
693  // operations are in flight. I assume it still can't dual-issue though.
694  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
695                               InstrStage<1, [A9_MUX0], 0>],
696                              [2, 1, 1]>,
697  //
698  // Single-precision FP Load
699  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
700                               InstrStage<1, [A9_MUX0], 0>,
701                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
702                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
703                               InstrStage<1, [A9_NPipe], 0>,
704                               InstrStage<1, [A9_LSUnit]>],
705                              [1, 1]>,
706  //
707  // Double-precision FP Load
708  // FIXME: Result latency is 1 if address is 64-bit aligned.
709  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
710                               InstrStage<1, [A9_MUX0], 0>,
711                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
712                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
713                               InstrStage<1, [A9_NPipe], 0>,
714                               InstrStage<1, [A9_LSUnit]>],
715                              [2, 1]>,
716  //
717  // FP Load Multiple
718  // FIXME: assumes 2 doubles which requires 2 LS cycles.
719  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
720                               InstrStage<1, [A9_MUX0], 0>,
721                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
722                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
723                               InstrStage<1, [A9_NPipe], 0>,
724                               InstrStage<2, [A9_LSUnit]>],
725                [1, 1, 1, 1], [], -1>, // dynamic uops
726  //
727  // FP Load Multiple + update
728  // FIXME: assumes 2 doubles which requires 2 LS cycles.
729  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
730                               InstrStage<1, [A9_MUX0], 0>,
731                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
732                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
733                               InstrStage<1, [A9_NPipe], 0>,
734                               InstrStage<2, [A9_LSUnit]>],
735                [2, 1, 1, 1], [], -1>, // dynamic uops
736  //
737  // Single-precision FP Store
738  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
739                               InstrStage<1, [A9_MUX0], 0>,
740                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
741                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
742                               InstrStage<1, [A9_NPipe], 0>,
743                               InstrStage<1, [A9_LSUnit]>],
744                              [1, 1]>,
745  //
746  // Double-precision FP Store
747  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
748                               InstrStage<1, [A9_MUX0], 0>,
749                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
750                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
751                               InstrStage<1, [A9_NPipe], 0>,
752                               InstrStage<1, [A9_LSUnit]>],
753                              [1, 1]>,
754  //
755  // FP Store Multiple
756  // FIXME: assumes 2 doubles which requires 2 LS cycles.
757  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
758                               InstrStage<1, [A9_MUX0], 0>,
759                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
760                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
761                               InstrStage<1, [A9_NPipe], 0>,
762                               InstrStage<2, [A9_LSUnit]>],
763                [1, 1, 1, 1], [], -1>, // dynamic uops
764  //
765  // FP Store Multiple + update
766  // FIXME: assumes 2 doubles which requires 2 LS cycles.
767  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
768                                InstrStage<1, [A9_MUX0], 0>,
769                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
770                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
771                                InstrStage<1, [A9_NPipe], 0>,
772                                InstrStage<2, [A9_LSUnit]>],
773                [2, 1, 1, 1], [], -1>, // dynamic uops
774  // NEON
775  // VLD1
776  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
777                               InstrStage<1, [A9_MUX0], 0>,
778                               InstrStage<1, [A9_DRegsN],   0, Required>,
779                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
780                               InstrStage<1, [A9_NPipe], 0>,
781                               InstrStage<1, [A9_LSUnit]>],
782                              [1, 1]>,
783  // VLD1x2
784  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
785                               InstrStage<1, [A9_MUX0], 0>,
786                               InstrStage<1, [A9_DRegsN],   0, Required>,
787                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
788                               InstrStage<1, [A9_NPipe], 0>,
789                               InstrStage<1, [A9_LSUnit]>],
790                              [1, 1, 1]>,
791  // VLD1x3
792  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
793                               InstrStage<1, [A9_MUX0], 0>,
794                               InstrStage<1, [A9_DRegsN],   0, Required>,
795                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
796                               InstrStage<2, [A9_NPipe], 0>,
797                               InstrStage<2, [A9_LSUnit]>],
798                              [1, 1, 2, 1]>,
799  // VLD1x4
800  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
801                               InstrStage<1, [A9_MUX0], 0>,
802                               InstrStage<1, [A9_DRegsN],   0, Required>,
803                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
804                               InstrStage<2, [A9_NPipe], 0>,
805                               InstrStage<2, [A9_LSUnit]>],
806                              [1, 1, 2, 2, 1]>,
807  // VLD1u
808  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
809                               InstrStage<1, [A9_MUX0], 0>,
810                               InstrStage<1, [A9_DRegsN],   0, Required>,
811                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
812                               InstrStage<1, [A9_NPipe], 0>,
813                               InstrStage<1, [A9_LSUnit]>],
814                              [1, 2, 1]>,
815  // VLD1x2u
816  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
817                               InstrStage<1, [A9_MUX0], 0>,
818                               InstrStage<1, [A9_DRegsN],   0, Required>,
819                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
820                               InstrStage<1, [A9_NPipe], 0>,
821                               InstrStage<1, [A9_LSUnit]>],
822                              [1, 1, 2, 1]>,
823  // VLD1x3u
824  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
825                               InstrStage<1, [A9_MUX0], 0>,
826                               InstrStage<1, [A9_DRegsN],   0, Required>,
827                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
828                               InstrStage<2, [A9_NPipe], 0>,
829                               InstrStage<2, [A9_LSUnit]>],
830                              [1, 1, 2, 2, 1]>,
831  // VLD1x4u
832  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
833                               InstrStage<1, [A9_MUX0], 0>,
834                               InstrStage<1, [A9_DRegsN],   0, Required>,
835                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
836                               InstrStage<2, [A9_NPipe], 0>,
837                               InstrStage<2, [A9_LSUnit]>],
838                              [1, 1, 2, 2, 2, 1]>,
839  //
840  // VLD1ln
841  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
842                               InstrStage<1, [A9_MUX0], 0>,
843                               InstrStage<1, [A9_DRegsN],   0, Required>,
844                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
845                               InstrStage<2, [A9_NPipe], 0>,
846                               InstrStage<2, [A9_LSUnit]>],
847                              [3, 1, 1, 1]>,
848  //
849  // VLD1lnu
850  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
851                               InstrStage<1, [A9_MUX0], 0>,
852                               InstrStage<1, [A9_DRegsN],   0, Required>,
853                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
854                               InstrStage<2, [A9_NPipe], 0>,
855                               InstrStage<2, [A9_LSUnit]>],
856                              [3, 2, 1, 1, 1, 1]>,
857  //
858  // VLD1dup
859  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
860                               InstrStage<1, [A9_MUX0], 0>,
861                               InstrStage<1, [A9_DRegsN],   0, Required>,
862                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
863                               InstrStage<1, [A9_NPipe], 0>,
864                               InstrStage<1, [A9_LSUnit]>],
865                              [2, 1]>,
866  //
867  // VLD1dupu
868  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
869                               InstrStage<1, [A9_MUX0], 0>,
870                               InstrStage<1, [A9_DRegsN],   0, Required>,
871                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
872                               InstrStage<1, [A9_NPipe], 0>,
873                               InstrStage<1, [A9_LSUnit]>],
874                              [2, 2, 1, 1]>,
875  //
876  // VLD2
877  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
878                               InstrStage<1, [A9_MUX0], 0>,
879                               InstrStage<1, [A9_DRegsN],   0, Required>,
880                               // Extra latency cycles since writeback is 7 cycles
881                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
882                               InstrStage<1, [A9_NPipe], 0>,
883                               InstrStage<1, [A9_LSUnit]>],
884                              [2, 2, 1]>,
885  //
886  // VLD2x2
887  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
888                               InstrStage<1, [A9_MUX0], 0>,
889                               InstrStage<1, [A9_DRegsN],   0, Required>,
890                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
891                               InstrStage<2, [A9_NPipe], 0>,
892                               InstrStage<2, [A9_LSUnit]>],
893                              [2, 3, 2, 3, 1]>,
894  //
895  // VLD2ln
896  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
897                               InstrStage<1, [A9_MUX0], 0>,
898                               InstrStage<1, [A9_DRegsN],   0, Required>,
899                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
900                               InstrStage<2, [A9_NPipe], 0>,
901                               InstrStage<2, [A9_LSUnit]>],
902                              [3, 3, 1, 1, 1, 1]>,
903  //
904  // VLD2u
905  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
906                               InstrStage<1, [A9_MUX0], 0>,
907                               InstrStage<1, [A9_DRegsN],   0, Required>,
908                               // Extra latency cycles since writeback is 7 cycles
909                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
910                               InstrStage<1, [A9_NPipe], 0>,
911                               InstrStage<1, [A9_LSUnit]>],
912                              [2, 2, 2, 1, 1, 1]>,
913  //
914  // VLD2x2u
915  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
916                               InstrStage<1, [A9_MUX0], 0>,
917                               InstrStage<1, [A9_DRegsN],   0, Required>,
918                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
919                               InstrStage<2, [A9_NPipe], 0>,
920                               InstrStage<2, [A9_LSUnit]>],
921                              [2, 3, 2, 3, 2, 1]>,
922  //
923  // VLD2lnu
924  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
925                               InstrStage<1, [A9_MUX0], 0>,
926                               InstrStage<1, [A9_DRegsN],   0, Required>,
927                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
928                               InstrStage<2, [A9_NPipe], 0>,
929                               InstrStage<2, [A9_LSUnit]>],
930                              [3, 3, 2, 1, 1, 1, 1, 1]>,
931  //
932  // VLD2dup
933  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
934                               InstrStage<1, [A9_MUX0], 0>,
935                               InstrStage<1, [A9_DRegsN],   0, Required>,
936                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
937                               InstrStage<1, [A9_NPipe], 0>,
938                               InstrStage<1, [A9_LSUnit]>],
939                              [2, 2, 1]>,
940  //
941  // VLD2dupu
942  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
943                               InstrStage<1, [A9_MUX0], 0>,
944                               InstrStage<1, [A9_DRegsN],   0, Required>,
945                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
946                               InstrStage<1, [A9_NPipe], 0>,
947                               InstrStage<1, [A9_LSUnit]>],
948                              [2, 2, 2, 1, 1]>,
949  //
950  // VLD3
951  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
952                               InstrStage<1, [A9_MUX0], 0>,
953                               InstrStage<1, [A9_DRegsN],   0, Required>,
954                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
955                               InstrStage<3, [A9_NPipe], 0>,
956                               InstrStage<3, [A9_LSUnit]>],
957                              [3, 3, 4, 1]>,
958  //
959  // VLD3ln
960  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
961                               InstrStage<1, [A9_MUX0], 0>,
962                               InstrStage<1, [A9_DRegsN],   0, Required>,
963                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
964                               InstrStage<5, [A9_NPipe], 0>,
965                               InstrStage<5, [A9_LSUnit]>],
966                              [5, 5, 6, 1, 1, 1, 1, 2]>,
967  //
968  // VLD3u
969  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
970                               InstrStage<1, [A9_MUX0], 0>,
971                               InstrStage<1, [A9_DRegsN],   0, Required>,
972                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
973                               InstrStage<3, [A9_NPipe], 0>,
974                               InstrStage<3, [A9_LSUnit]>],
975                              [3, 3, 4, 2, 1]>,
976  //
977  // VLD3lnu
978  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
979                               InstrStage<1, [A9_MUX0], 0>,
980                               InstrStage<1, [A9_DRegsN],   0, Required>,
981                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
982                               InstrStage<5, [A9_NPipe], 0>,
983                               InstrStage<5, [A9_LSUnit]>],
984                              [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
985  //
986  // VLD3dup
987  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
988                               InstrStage<1, [A9_MUX0], 0>,
989                               InstrStage<1, [A9_DRegsN],   0, Required>,
990                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
991                               InstrStage<3, [A9_NPipe], 0>,
992                               InstrStage<3, [A9_LSUnit]>],
993                              [3, 3, 4, 1]>,
994  //
995  // VLD3dupu
996  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
997                               InstrStage<1, [A9_MUX0], 0>,
998                               InstrStage<1, [A9_DRegsN],   0, Required>,
999                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1000                               InstrStage<3, [A9_NPipe], 0>,
1001                               InstrStage<3, [A9_LSUnit]>],
1002                              [3, 3, 4, 2, 1, 1]>,
1003  //
1004  // VLD4
1005  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1006                               InstrStage<1, [A9_MUX0], 0>,
1007                               InstrStage<1, [A9_DRegsN],   0, Required>,
1008                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
1009                               InstrStage<3, [A9_NPipe], 0>,
1010                               InstrStage<3, [A9_LSUnit]>],
1011                              [3, 3, 4, 4, 1]>,
1012  //
1013  // VLD4ln
1014  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1015                               InstrStage<1, [A9_MUX0], 0>,
1016                               InstrStage<1, [A9_DRegsN],   0, Required>,
1017                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
1018                               InstrStage<4, [A9_NPipe], 0>,
1019                               InstrStage<4, [A9_LSUnit]>],
1020                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
1021  //
1022  // VLD4u
1023  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1024                               InstrStage<1, [A9_MUX0], 0>,
1025                               InstrStage<1, [A9_DRegsN],   0, Required>,
1026                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
1027                               InstrStage<3, [A9_NPipe], 0>,
1028                               InstrStage<3, [A9_LSUnit]>],
1029                              [3, 3, 4, 4, 2, 1]>,
1030  //
1031  // VLD4lnu
1032  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1033                               InstrStage<1, [A9_MUX0], 0>,
1034                               InstrStage<1, [A9_DRegsN],   0, Required>,
1035                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
1036                               InstrStage<4, [A9_NPipe], 0>,
1037                               InstrStage<4, [A9_LSUnit]>],
1038                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
1039  //
1040  // VLD4dup
1041  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1042                               InstrStage<1, [A9_MUX0], 0>,
1043                               InstrStage<1, [A9_DRegsN],   0, Required>,
1044                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1045                               InstrStage<2, [A9_NPipe], 0>,
1046                               InstrStage<2, [A9_LSUnit]>],
1047                              [2, 2, 3, 3, 1]>,
1048  //
1049  // VLD4dupu
1050  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1051                               InstrStage<1, [A9_MUX0], 0>,
1052                               InstrStage<1, [A9_DRegsN],   0, Required>,
1053                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1054                               InstrStage<2, [A9_NPipe], 0>,
1055                               InstrStage<2, [A9_LSUnit]>],
1056                              [2, 2, 3, 3, 2, 1, 1]>,
1057  //
1058  // VST1
1059  InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1060                               InstrStage<1, [A9_MUX0], 0>,
1061                               InstrStage<1, [A9_DRegsN],   0, Required>,
1062                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1063                               InstrStage<1, [A9_NPipe], 0>,
1064                               InstrStage<1, [A9_LSUnit]>],
1065                              [1, 1, 1]>,
1066  //
1067  // VST1x2
1068  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1069                               InstrStage<1, [A9_MUX0], 0>,
1070                               InstrStage<1, [A9_DRegsN],   0, Required>,
1071                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1072                               InstrStage<1, [A9_NPipe], 0>,
1073                               InstrStage<1, [A9_LSUnit]>],
1074                              [1, 1, 1, 1]>,
1075  //
1076  // VST1x3
1077  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1078                               InstrStage<1, [A9_MUX0], 0>,
1079                               InstrStage<1, [A9_DRegsN],   0, Required>,
1080                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1081                               InstrStage<2, [A9_NPipe], 0>,
1082                               InstrStage<2, [A9_LSUnit]>],
1083                              [1, 1, 1, 1, 2]>,
1084  //
1085  // VST1x4
1086  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1087                               InstrStage<1, [A9_MUX0], 0>,
1088                               InstrStage<1, [A9_DRegsN],   0, Required>,
1089                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1090                               InstrStage<2, [A9_NPipe], 0>,
1091                               InstrStage<2, [A9_LSUnit]>],
1092                              [1, 1, 1, 1, 2, 2]>,
1093  //
1094  // VST1u
1095  InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1096                               InstrStage<1, [A9_MUX0], 0>,
1097                               InstrStage<1, [A9_DRegsN],   0, Required>,
1098                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1099                               InstrStage<1, [A9_NPipe], 0>,
1100                               InstrStage<1, [A9_LSUnit]>],
1101                              [2, 1, 1, 1, 1]>,
1102  //
1103  // VST1x2u
1104  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1105                               InstrStage<1, [A9_MUX0], 0>,
1106                               InstrStage<1, [A9_DRegsN],   0, Required>,
1107                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1108                               InstrStage<1, [A9_NPipe], 0>,
1109                               InstrStage<1, [A9_LSUnit]>],
1110                              [2, 1, 1, 1, 1, 1]>,
1111  //
1112  // VST1x3u
1113  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1114                               InstrStage<1, [A9_MUX0], 0>,
1115                               InstrStage<1, [A9_DRegsN],   0, Required>,
1116                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1117                               InstrStage<2, [A9_NPipe], 0>,
1118                               InstrStage<2, [A9_LSUnit]>],
1119                              [2, 1, 1, 1, 1, 1, 2]>,
1120  //
1121  // VST1x4u
1122  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1123                               InstrStage<1, [A9_MUX0], 0>,
1124                               InstrStage<1, [A9_DRegsN],   0, Required>,
1125                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1126                               InstrStage<2, [A9_NPipe], 0>,
1127                               InstrStage<2, [A9_LSUnit]>],
1128                              [2, 1, 1, 1, 1, 1, 2, 2]>,
1129  //
1130  // VST1ln
1131  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1132                               InstrStage<1, [A9_MUX0], 0>,
1133                               InstrStage<1, [A9_DRegsN],   0, Required>,
1134                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1135                               InstrStage<1, [A9_NPipe], 0>,
1136                               InstrStage<1, [A9_LSUnit]>],
1137                              [1, 1, 1]>,
1138  //
1139  // VST1lnu
1140  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1141                               InstrStage<1, [A9_MUX0], 0>,
1142                               InstrStage<1, [A9_DRegsN],   0, Required>,
1143                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1144                               InstrStage<1, [A9_NPipe], 0>,
1145                               InstrStage<1, [A9_LSUnit]>],
1146                              [2, 1, 1, 1, 1]>,
1147  //
1148  // VST2
1149  InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1150                               InstrStage<1, [A9_MUX0], 0>,
1151                               InstrStage<1, [A9_DRegsN],   0, Required>,
1152                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1153                               InstrStage<1, [A9_NPipe], 0>,
1154                               InstrStage<1, [A9_LSUnit]>],
1155                              [1, 1, 1, 1]>,
1156  //
1157  // VST2x2
1158  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1159                               InstrStage<1, [A9_MUX0], 0>,
1160                               InstrStage<1, [A9_DRegsN],   0, Required>,
1161                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1162                               InstrStage<3, [A9_NPipe], 0>,
1163                               InstrStage<3, [A9_LSUnit]>],
1164                              [1, 1, 1, 1, 2, 2]>,
1165  //
1166  // VST2u
1167  InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1168                               InstrStage<1, [A9_MUX0], 0>,
1169                               InstrStage<1, [A9_DRegsN],   0, Required>,
1170                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1171                               InstrStage<1, [A9_NPipe], 0>,
1172                               InstrStage<1, [A9_LSUnit]>],
1173                              [2, 1, 1, 1, 1, 1]>,
1174  //
1175  // VST2x2u
1176  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1177                               InstrStage<1, [A9_MUX0], 0>,
1178                               InstrStage<1, [A9_DRegsN],   0, Required>,
1179                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1180                               InstrStage<3, [A9_NPipe], 0>,
1181                               InstrStage<3, [A9_LSUnit]>],
1182                              [2, 1, 1, 1, 1, 1, 2, 2]>,
1183  //
1184  // VST2ln
1185  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1186                               InstrStage<1, [A9_MUX0], 0>,
1187                               InstrStage<1, [A9_DRegsN],   0, Required>,
1188                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1189                               InstrStage<1, [A9_NPipe], 0>,
1190                               InstrStage<1, [A9_LSUnit]>],
1191                              [1, 1, 1, 1]>,
1192  //
1193  // VST2lnu
1194  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1195                               InstrStage<1, [A9_MUX0], 0>,
1196                               InstrStage<1, [A9_DRegsN],   0, Required>,
1197                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1198                               InstrStage<1, [A9_NPipe], 0>,
1199                               InstrStage<1, [A9_LSUnit]>],
1200                              [2, 1, 1, 1, 1, 1]>,
1201  //
1202  // VST3
1203  InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1204                               InstrStage<1, [A9_MUX0], 0>,
1205                               InstrStage<1, [A9_DRegsN],   0, Required>,
1206                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1207                               InstrStage<2, [A9_NPipe], 0>,
1208                               InstrStage<2, [A9_LSUnit]>],
1209                              [1, 1, 1, 1, 2]>,
1210  //
1211  // VST3u
1212  InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1213                               InstrStage<1, [A9_MUX0], 0>,
1214                               InstrStage<1, [A9_DRegsN],   0, Required>,
1215                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1216                               InstrStage<2, [A9_NPipe], 0>,
1217                               InstrStage<2, [A9_LSUnit]>],
1218                              [2, 1, 1, 1, 1, 1, 2]>,
1219  //
1220  // VST3ln
1221  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1222                               InstrStage<1, [A9_MUX0], 0>,
1223                               InstrStage<1, [A9_DRegsN],   0, Required>,
1224                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1225                               InstrStage<3, [A9_NPipe], 0>,
1226                               InstrStage<3, [A9_LSUnit]>],
1227                              [1, 1, 1, 1, 2]>,
1228  //
1229  // VST3lnu
1230  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1231                               InstrStage<1, [A9_MUX0], 0>,
1232                               InstrStage<1, [A9_DRegsN],   0, Required>,
1233                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1234                               InstrStage<3, [A9_NPipe], 0>,
1235                               InstrStage<3, [A9_LSUnit]>],
1236                              [2, 1, 1, 1, 1, 1, 2]>,
1237  //
1238  // VST4
1239  InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1240                               InstrStage<1, [A9_MUX0], 0>,
1241                               InstrStage<1, [A9_DRegsN],   0, Required>,
1242                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1243                               InstrStage<2, [A9_NPipe], 0>,
1244                               InstrStage<2, [A9_LSUnit]>],
1245                              [1, 1, 1, 1, 2, 2]>,
1246  //
1247  // VST4u
1248  InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1249                               InstrStage<1, [A9_MUX0], 0>,
1250                               InstrStage<1, [A9_DRegsN],   0, Required>,
1251                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1252                               InstrStage<2, [A9_NPipe], 0>,
1253                               InstrStage<2, [A9_LSUnit]>],
1254                              [2, 1, 1, 1, 1, 1, 2, 2]>,
1255  //
1256  // VST4ln
1257  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1258                               InstrStage<1, [A9_MUX0], 0>,
1259                               InstrStage<1, [A9_DRegsN],   0, Required>,
1260                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1261                               InstrStage<2, [A9_NPipe], 0>,
1262                               InstrStage<2, [A9_LSUnit]>],
1263                              [1, 1, 1, 1, 2, 2]>,
1264  //
1265  // VST4lnu
1266  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1267                               InstrStage<1, [A9_MUX0], 0>,
1268                               InstrStage<1, [A9_DRegsN],   0, Required>,
1269                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
1270                               InstrStage<2, [A9_NPipe], 0>,
1271                               InstrStage<2, [A9_LSUnit]>],
1272                              [2, 1, 1, 1, 1, 1, 2, 2]>,
1273
1274  //
1275  // Double-register Integer Unary
1276  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1277                               InstrStage<1, [A9_MUX0], 0>,
1278                               InstrStage<1, [A9_DRegsN],   0, Required>,
1279                               // Extra latency cycles since wbck is 6 cycles
1280                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1281                               InstrStage<1, [A9_NPipe]>],
1282                              [4, 2]>,
1283  //
1284  // Quad-register Integer Unary
1285  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1286                               InstrStage<1, [A9_MUX0], 0>,
1287                               InstrStage<1, [A9_DRegsN],   0, Required>,
1288                               // Extra latency cycles since wbck is 6 cycles
1289                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1290                               InstrStage<1, [A9_NPipe]>],
1291                              [4, 2]>,
1292  //
1293  // Double-register Integer Q-Unary
1294  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1295                               InstrStage<1, [A9_MUX0], 0>,
1296                               InstrStage<1, [A9_DRegsN],   0, Required>,
1297                               // Extra latency cycles since wbck is 6 cycles
1298                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1299                               InstrStage<1, [A9_NPipe]>],
1300                              [4, 1]>,
1301  //
1302  // Quad-register Integer Q-Unary
1303  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1304                               InstrStage<1, [A9_MUX0], 0>,
1305                               InstrStage<1, [A9_DRegsN],   0, Required>,
1306                               // Extra latency cycles since wbck is 6 cycles
1307                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1308                               InstrStage<1, [A9_NPipe]>],
1309                              [4, 1]>,
1310  //
1311  // Double-register Integer Binary
1312  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1313                               InstrStage<1, [A9_MUX0], 0>,
1314                               InstrStage<1, [A9_DRegsN],   0, Required>,
1315                               // Extra latency cycles since wbck is 6 cycles
1316                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1317                               InstrStage<1, [A9_NPipe]>],
1318                              [3, 2, 2]>,
1319  //
1320  // Quad-register Integer Binary
1321  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1322                               InstrStage<1, [A9_MUX0], 0>,
1323                               InstrStage<1, [A9_DRegsN],   0, Required>,
1324                               // Extra latency cycles since wbck is 6 cycles
1325                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1326                               InstrStage<1, [A9_NPipe]>],
1327                              [3, 2, 2]>,
1328  //
1329  // Double-register Integer Subtract
1330  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1331                               InstrStage<1, [A9_MUX0], 0>,
1332                               InstrStage<1, [A9_DRegsN],   0, Required>,
1333                               // Extra latency cycles since wbck is 6 cycles
1334                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1335                               InstrStage<1, [A9_NPipe]>],
1336                              [3, 2, 1]>,
1337  //
1338  // Quad-register Integer Subtract
1339  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1340                               InstrStage<1, [A9_MUX0], 0>,
1341                               InstrStage<1, [A9_DRegsN],   0, Required>,
1342                               // Extra latency cycles since wbck is 6 cycles
1343                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1344                               InstrStage<1, [A9_NPipe]>],
1345                              [3, 2, 1]>,
1346  //
1347  // Double-register Integer Shift
1348  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1349                               InstrStage<1, [A9_MUX0], 0>,
1350                               InstrStage<1, [A9_DRegsN],   0, Required>,
1351                               // Extra latency cycles since wbck is 6 cycles
1352                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1353                               InstrStage<1, [A9_NPipe]>],
1354                              [3, 1, 1]>,
1355  //
1356  // Quad-register Integer Shift
1357  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1358                               InstrStage<1, [A9_MUX0], 0>,
1359                               InstrStage<1, [A9_DRegsN],   0, Required>,
1360                               // Extra latency cycles since wbck is 6 cycles
1361                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1362                               InstrStage<1, [A9_NPipe]>],
1363                              [3, 1, 1]>,
1364  //
1365  // Double-register Integer Shift (4 cycle)
1366  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1367                               InstrStage<1, [A9_MUX0], 0>,
1368                               InstrStage<1, [A9_DRegsN],   0, Required>,
1369                               // Extra latency cycles since wbck is 6 cycles
1370                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1371                               InstrStage<1, [A9_NPipe]>],
1372                              [4, 1, 1]>,
1373  //
1374  // Quad-register Integer Shift (4 cycle)
1375  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1376                               InstrStage<1, [A9_MUX0], 0>,
1377                               InstrStage<1, [A9_DRegsN],   0, Required>,
1378                               // Extra latency cycles since wbck is 6 cycles
1379                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1380                               InstrStage<1, [A9_NPipe]>],
1381                              [4, 1, 1]>,
1382  //
1383  // Double-register Integer Binary (4 cycle)
1384  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1385                               InstrStage<1, [A9_MUX0], 0>,
1386                               InstrStage<1, [A9_DRegsN],   0, Required>,
1387                               // Extra latency cycles since wbck is 6 cycles
1388                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1389                               InstrStage<1, [A9_NPipe]>],
1390                              [4, 2, 2]>,
1391  //
1392  // Quad-register Integer Binary (4 cycle)
1393  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1394                               InstrStage<1, [A9_MUX0], 0>,
1395                               InstrStage<1, [A9_DRegsN],   0, Required>,
1396                               // Extra latency cycles since wbck is 6 cycles
1397                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1398                               InstrStage<1, [A9_NPipe]>],
1399                              [4, 2, 2]>,
1400  //
1401  // Double-register Integer Subtract (4 cycle)
1402  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1403                               InstrStage<1, [A9_MUX0], 0>,
1404                               InstrStage<1, [A9_DRegsN],   0, Required>,
1405                               // Extra latency cycles since wbck is 6 cycles
1406                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1407                               InstrStage<1, [A9_NPipe]>],
1408                              [4, 2, 1]>,
1409  //
1410  // Quad-register Integer Subtract (4 cycle)
1411  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1412                               InstrStage<1, [A9_MUX0], 0>,
1413                               InstrStage<1, [A9_DRegsN],   0, Required>,
1414                               // Extra latency cycles since wbck is 6 cycles
1415                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1416                               InstrStage<1, [A9_NPipe]>],
1417                              [4, 2, 1]>,
1418
1419  //
1420  // Double-register Integer Count
1421  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1422                               InstrStage<1, [A9_MUX0], 0>,
1423                               InstrStage<1, [A9_DRegsN],   0, Required>,
1424                               // Extra latency cycles since wbck is 6 cycles
1425                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1426                               InstrStage<1, [A9_NPipe]>],
1427                              [3, 2, 2]>,
1428  //
1429  // Quad-register Integer Count
1430  // Result is written in N3, but that is relative to the last cycle of a
1431  // multicycle instruction, so we use 4 for those cases.
1432  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1433                               InstrStage<1, [A9_MUX0], 0>,
1434                               InstrStage<1, [A9_DRegsN],   0, Required>,
1435                               // Extra latency cycles since wbck is 7 cycles
1436                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1437                               InstrStage<2, [A9_NPipe]>],
1438                              [4, 2, 2]>,
1439  //
1440  // Double-register Absolute Difference and Accumulate
1441  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1442                               InstrStage<1, [A9_MUX0], 0>,
1443                               InstrStage<1, [A9_DRegsN],   0, Required>,
1444                               // Extra latency cycles since wbck is 6 cycles
1445                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1446                               InstrStage<1, [A9_NPipe]>],
1447                              [6, 3, 2, 1]>,
1448  //
1449  // Quad-register Absolute Difference and Accumulate
1450  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1451                               InstrStage<1, [A9_MUX0], 0>,
1452                               InstrStage<1, [A9_DRegsN],   0, Required>,
1453                               // Extra latency cycles since wbck is 6 cycles
1454                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1455                               InstrStage<2, [A9_NPipe]>],
1456                              [6, 3, 2, 1]>,
1457  //
1458  // Double-register Integer Pair Add Long
1459  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1460                               InstrStage<1, [A9_MUX0], 0>,
1461                               InstrStage<1, [A9_DRegsN],   0, Required>,
1462                               // Extra latency cycles since wbck is 6 cycles
1463                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1464                               InstrStage<1, [A9_NPipe]>],
1465                              [6, 3, 1]>,
1466  //
1467  // Quad-register Integer Pair Add Long
1468  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1469                               InstrStage<1, [A9_MUX0], 0>,
1470                               InstrStage<1, [A9_DRegsN],   0, Required>,
1471                               // Extra latency cycles since wbck is 6 cycles
1472                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1473                               InstrStage<2, [A9_NPipe]>],
1474                              [6, 3, 1]>,
1475
1476  //
1477  // Double-register Integer Multiply (.8, .16)
1478  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1479                               InstrStage<1, [A9_MUX0], 0>,
1480                               InstrStage<1, [A9_DRegsN],   0, Required>,
1481                               // Extra latency cycles since wbck is 6 cycles
1482                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1483                               InstrStage<1, [A9_NPipe]>],
1484                              [6, 2, 2]>,
1485  //
1486  // Quad-register Integer Multiply (.8, .16)
1487  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1488                               InstrStage<1, [A9_MUX0], 0>,
1489                               InstrStage<1, [A9_DRegsN],   0, Required>,
1490                               // Extra latency cycles since wbck is 7 cycles
1491                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1492                               InstrStage<2, [A9_NPipe]>],
1493                              [7, 2, 2]>,
1494
1495  //
1496  // Double-register Integer Multiply (.32)
1497  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1498                               InstrStage<1, [A9_MUX0], 0>,
1499                               InstrStage<1, [A9_DRegsN],   0, Required>,
1500                               // Extra latency cycles since wbck is 7 cycles
1501                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1502                               InstrStage<2, [A9_NPipe]>],
1503                              [7, 2, 1]>,
1504  //
1505  // Quad-register Integer Multiply (.32)
1506  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1507                               InstrStage<1, [A9_MUX0], 0>,
1508                               InstrStage<1, [A9_DRegsN],   0, Required>,
1509                               // Extra latency cycles since wbck is 9 cycles
1510                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1511                               InstrStage<4, [A9_NPipe]>],
1512                              [9, 2, 1]>,
1513  //
1514  // Double-register Integer Multiply-Accumulate (.8, .16)
1515  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1516                               InstrStage<1, [A9_MUX0], 0>,
1517                               InstrStage<1, [A9_DRegsN],   0, Required>,
1518                               // Extra latency cycles since wbck is 6 cycles
1519                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1520                               InstrStage<1, [A9_NPipe]>],
1521                              [6, 3, 2, 2]>,
1522  //
1523  // Double-register Integer Multiply-Accumulate (.32)
1524  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1525                               InstrStage<1, [A9_MUX0], 0>,
1526                               InstrStage<1, [A9_DRegsN],   0, Required>,
1527                               // Extra latency cycles since wbck is 7 cycles
1528                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1529                               InstrStage<2, [A9_NPipe]>],
1530                              [7, 3, 2, 1]>,
1531  //
1532  // Quad-register Integer Multiply-Accumulate (.8, .16)
1533  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1534                               InstrStage<1, [A9_MUX0], 0>,
1535                               InstrStage<1, [A9_DRegsN],   0, Required>,
1536                               // Extra latency cycles since wbck is 7 cycles
1537                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1538                               InstrStage<2, [A9_NPipe]>],
1539                              [7, 3, 2, 2]>,
1540  //
1541  // Quad-register Integer Multiply-Accumulate (.32)
1542  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1543                               InstrStage<1, [A9_MUX0], 0>,
1544                               InstrStage<1, [A9_DRegsN],   0, Required>,
1545                               // Extra latency cycles since wbck is 9 cycles
1546                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1547                               InstrStage<4, [A9_NPipe]>],
1548                              [9, 3, 2, 1]>,
1549
1550  //
1551  // Move
1552  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1553                               InstrStage<1, [A9_MUX0], 0>,
1554                               InstrStage<1, [A9_DRegsN],   0, Required>,
1555                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
1556                               InstrStage<1, [A9_NPipe]>],
1557                              [1,1]>,
1558  //
1559  // Move Immediate
1560  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1561                               InstrStage<1, [A9_MUX0], 0>,
1562                               InstrStage<1, [A9_DRegsN],   0, Required>,
1563                               // Extra latency cycles since wbck is 6 cycles
1564                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1565                               InstrStage<1, [A9_NPipe]>],
1566                              [3]>,
1567  //
1568  // Double-register Permute Move
1569  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1570                               InstrStage<1, [A9_MUX0], 0>,
1571                               InstrStage<1, [A9_DRegsN],   0, Required>,
1572                               // Extra latency cycles since wbck is 6 cycles
1573                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1574                               InstrStage<1, [A9_NPipe]>],
1575                              [2, 1]>,
1576  //
1577  // Quad-register Permute Move
1578  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1579                               InstrStage<1, [A9_MUX0], 0>,
1580                               InstrStage<1, [A9_DRegsN],   0, Required>,
1581                               // Extra latency cycles since wbck is 6 cycles
1582                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1583                               InstrStage<1, [A9_NPipe]>],
1584                              [2, 1]>,
1585  //
1586  // Integer to Single-precision Move
1587  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1588                               InstrStage<1, [A9_MUX0], 0>,
1589                               InstrStage<1, [A9_DRegsN],   0, Required>,
1590                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1591                               InstrStage<1, [A9_NPipe]>],
1592                              [1, 1]>,
1593  //
1594  // Integer to Double-precision Move
1595  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1596                               InstrStage<1, [A9_MUX0], 0>,
1597                               InstrStage<1, [A9_DRegsN],   0, Required>,
1598                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1599                               InstrStage<1, [A9_NPipe]>],
1600                              [1, 1, 1]>,
1601  //
1602  // Single-precision to Integer Move
1603  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1604                               InstrStage<1, [A9_MUX0], 0>,
1605                               InstrStage<1, [A9_DRegsN],   0, Required>,
1606                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1607                               InstrStage<1, [A9_NPipe]>],
1608                              [2, 1]>,
1609  //
1610  // Double-precision to Integer Move
1611  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1612                               InstrStage<1, [A9_MUX0], 0>,
1613                               InstrStage<1, [A9_DRegsN],   0, Required>,
1614                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
1615                               InstrStage<1, [A9_NPipe]>],
1616                              [2, 2, 1]>,
1617  //
1618  // Integer to Lane Move
1619  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1620                               InstrStage<1, [A9_MUX0], 0>,
1621                               InstrStage<1, [A9_DRegsN],   0, Required>,
1622                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
1623                               InstrStage<2, [A9_NPipe]>],
1624                              [3, 1, 1]>,
1625
1626  //
1627  // Vector narrow move
1628  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1629                               InstrStage<1, [A9_MUX0], 0>,
1630                               InstrStage<1, [A9_DRegsN],   0, Required>,
1631                               // Extra latency cycles since wbck is 6 cycles
1632                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1633                               InstrStage<1, [A9_NPipe]>],
1634                              [3, 1]>,
1635  //
1636  // Double-register FP Unary
1637  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1638                               InstrStage<1, [A9_MUX0], 0>,
1639                               InstrStage<1, [A9_DRegsN],   0, Required>,
1640                               // Extra latency cycles since wbck is 6 cycles
1641                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1642                               InstrStage<1, [A9_NPipe]>],
1643                              [5, 2]>,
1644  //
1645  // Quad-register FP Unary
1646  // Result is written in N5, but that is relative to the last cycle of a
1647  // multicycle instruction, so we use 6 for those cases.
1648  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1649                               InstrStage<1, [A9_MUX0], 0>,
1650                               InstrStage<1, [A9_DRegsN],   0, Required>,
1651                               // Extra latency cycles since wbck is 7 cycles
1652                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1653                               InstrStage<2, [A9_NPipe]>],
1654                              [6, 2]>,
1655  //
1656  // Double-register FP Binary
1657  // FIXME: We're using this itinerary for many instructions, and the
1658  // trailing [2, 2] operand cycles here are too optimistic.
1659  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1660                               InstrStage<1, [A9_MUX0], 0>,
1661                               InstrStage<1, [A9_DRegsN],   0, Required>,
1662                               // Extra latency cycles since wbck is 6 cycles
1663                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1664                               InstrStage<1, [A9_NPipe]>],
1665                              [5, 2, 2]>,
1666
1667  //
1668  // VPADD, etc.
1669  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1670                               InstrStage<1, [A9_MUX0], 0>,
1671                               InstrStage<1, [A9_DRegsN],   0, Required>,
1672                               // Extra latency cycles since wbck is 6 cycles
1673                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1674                               InstrStage<1, [A9_NPipe]>],
1675                              [5, 1, 1]>,
1676  //
1677  // Double-register FP VMUL
1678  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1679                               InstrStage<1, [A9_MUX0], 0>,
1680                               InstrStage<1, [A9_DRegsN],   0, Required>,
1681                               // Extra latency cycles since wbck is 6 cycles
1682                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1683                               InstrStage<1, [A9_NPipe]>],
1684                              [5, 2, 1]>,
1685  //
1686  // Quad-register FP Binary
1687  // Result written in N5, but that is relative to the last cycle of multicycle,
1688  // so we use 6 for those cases
1689  // FIXME: We're using this itin for many instructions and [2, 2] here is too
1690  // optimistic.
1691  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1692                               InstrStage<1, [A9_MUX0], 0>,
1693                               InstrStage<1, [A9_DRegsN],   0, Required>,
1694                               // Extra latency cycles since wbck is 7 cycles
1695                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1696                               InstrStage<2, [A9_NPipe]>],
1697                              [6, 2, 2]>,
1698  //
1699  // Quad-register FP VMUL
1700  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1701                               InstrStage<1, [A9_MUX0], 0>,
1702                               InstrStage<1, [A9_DRegsN],   0, Required>,
1703                               // Extra latency cycles since wbck is 7 cycles
1704                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1705                               InstrStage<1, [A9_NPipe]>],
1706                              [6, 2, 1]>,
1707  //
1708  // Double-register FP Multiply-Accumulate
1709  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1710                               InstrStage<1, [A9_MUX0], 0>,
1711                               InstrStage<1, [A9_DRegsN],   0, Required>,
1712                               // Extra latency cycles since wbck is 7 cycles
1713                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1714                               InstrStage<2, [A9_NPipe]>],
1715                              [6, 3, 2, 1]>,
1716  //
1717  // Quad-register FP Multiply-Accumulate
1718  // Result written in N9, but that is relative to the last cycle of multicycle,
1719  // so we use 10 for those cases
1720  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1721                               InstrStage<1, [A9_MUX0], 0>,
1722                               InstrStage<1, [A9_DRegsN],   0, Required>,
1723                               // Extra latency cycles since wbck is 9 cycles
1724                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1725                               InstrStage<4, [A9_NPipe]>],
1726                              [8, 4, 2, 1]>,
1727  //
1728  // Double-register Fused FP Multiply-Accumulate
1729  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1730                               InstrStage<1, [A9_MUX0], 0>,
1731                               InstrStage<1, [A9_DRegsN],   0, Required>,
1732                               // Extra latency cycles since wbck is 7 cycles
1733                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1734                               InstrStage<2, [A9_NPipe]>],
1735                              [6, 3, 2, 1]>,
1736  //
1737  // Quad-register Fused FP Multiply-Accumulate
1738  // Result written in N9, but that is relative to the last cycle of multicycle,
1739  // so we use 10 for those cases
1740  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1741                               InstrStage<1, [A9_MUX0], 0>,
1742                               InstrStage<1, [A9_DRegsN],   0, Required>,
1743                               // Extra latency cycles since wbck is 9 cycles
1744                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1745                               InstrStage<4, [A9_NPipe]>],
1746                              [8, 4, 2, 1]>,
1747  //
1748  // Double-register Reciprocal Step
1749  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1750                               InstrStage<1, [A9_MUX0], 0>,
1751                               InstrStage<1, [A9_DRegsN],   0, Required>,
1752                               // Extra latency cycles since wbck is 10 cycles
1753                               InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
1754                               InstrStage<1, [A9_NPipe]>],
1755                              [9, 2, 2]>,
1756  //
1757  // Quad-register Reciprocal Step
1758  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1759                               InstrStage<1, [A9_MUX0], 0>,
1760                               InstrStage<1, [A9_DRegsN],   0, Required>,
1761                               // Extra latency cycles since wbck is 11 cycles
1762                               InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
1763                               InstrStage<2, [A9_NPipe]>],
1764                              [10, 2, 2]>,
1765  //
1766  // Double-register Permute
1767  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1768                               InstrStage<1, [A9_MUX0], 0>,
1769                               InstrStage<1, [A9_DRegsN],   0, Required>,
1770                               // Extra latency cycles since wbck is 6 cycles
1771                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1772                               InstrStage<1, [A9_NPipe]>],
1773                              [2, 2, 1, 1]>,
1774  //
1775  // Quad-register Permute
1776  // Result written in N2, but that is relative to the last cycle of multicycle,
1777  // so we use 3 for those cases
1778  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1779                               InstrStage<1, [A9_MUX0], 0>,
1780                               InstrStage<1, [A9_DRegsN],   0, Required>,
1781                               // Extra latency cycles since wbck is 7 cycles
1782                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1783                               InstrStage<2, [A9_NPipe]>],
1784                              [3, 3, 1, 1]>,
1785  //
1786  // Quad-register Permute (3 cycle issue)
1787  // Result written in N2, but that is relative to the last cycle of multicycle,
1788  // so we use 4 for those cases
1789  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1790                               InstrStage<1, [A9_MUX0], 0>,
1791                               InstrStage<1, [A9_DRegsN],   0, Required>,
1792                               // Extra latency cycles since wbck is 8 cycles
1793                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1794                               InstrStage<3, [A9_NPipe]>],
1795                              [4, 4, 1, 1]>,
1796
1797  //
1798  // Double-register VEXT
1799  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1800                               InstrStage<1, [A9_MUX0], 0>,
1801                               InstrStage<1, [A9_DRegsN],   0, Required>,
1802                               // Extra latency cycles since wbck is 6 cycles
1803                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1804                               InstrStage<1, [A9_NPipe]>],
1805                              [2, 1, 1]>,
1806  //
1807  // Quad-register VEXT
1808  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1809                               InstrStage<1, [A9_MUX0], 0>,
1810                               InstrStage<1, [A9_DRegsN],   0, Required>,
1811                               // Extra latency cycles since wbck is 7 cycles
1812                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1813                               InstrStage<2, [A9_NPipe]>],
1814                              [3, 1, 2]>,
1815  //
1816  // VTB
1817  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1818                               InstrStage<1, [A9_MUX0], 0>,
1819                               InstrStage<1, [A9_DRegsN],   0, Required>,
1820                               // Extra latency cycles since wbck is 7 cycles
1821                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1822                               InstrStage<2, [A9_NPipe]>],
1823                              [3, 2, 1]>,
1824  InstrItinData<IIC_VTB2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1825                               InstrStage<1, [A9_MUX0], 0>,
1826                               InstrStage<2, [A9_DRegsN],   0, Required>,
1827                               // Extra latency cycles since wbck is 7 cycles
1828                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1829                               InstrStage<2, [A9_NPipe]>],
1830                              [3, 2, 2, 1]>,
1831  InstrItinData<IIC_VTB3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1832                               InstrStage<1, [A9_MUX0], 0>,
1833                               InstrStage<2, [A9_DRegsN],   0, Required>,
1834                               // Extra latency cycles since wbck is 8 cycles
1835                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1836                               InstrStage<3, [A9_NPipe]>],
1837                              [4, 2, 2, 3, 1]>,
1838  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1839                               InstrStage<1, [A9_MUX0], 0>,
1840                               InstrStage<1, [A9_DRegsN],   0, Required>,
1841                               // Extra latency cycles since wbck is 8 cycles
1842                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1843                               InstrStage<3, [A9_NPipe]>],
1844                              [4, 2, 2, 3, 3, 1]>,
1845  //
1846  // VTBX
1847  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1848                               InstrStage<1, [A9_MUX0], 0>,
1849                               InstrStage<1, [A9_DRegsN],   0, Required>,
1850                               // Extra latency cycles since wbck is 7 cycles
1851                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1852                               InstrStage<2, [A9_NPipe]>],
1853                              [3, 1, 2, 1]>,
1854  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1855                               InstrStage<1, [A9_MUX0], 0>,
1856                               InstrStage<1, [A9_DRegsN],   0, Required>,
1857                               // Extra latency cycles since wbck is 7 cycles
1858                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1859                               InstrStage<2, [A9_NPipe]>],
1860                              [3, 1, 2, 2, 1]>,
1861  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1862                               InstrStage<1, [A9_MUX0], 0>,
1863                               InstrStage<1, [A9_DRegsN],   0, Required>,
1864                               // Extra latency cycles since wbck is 8 cycles
1865                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1866                               InstrStage<3, [A9_NPipe]>],
1867                              [4, 1, 2, 2, 3, 1]>,
1868  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
1869                               InstrStage<1, [A9_MUX0], 0>,
1870                               InstrStage<1, [A9_DRegsN],   0, Required>,
1871                               // Extra latency cycles since wbck is 8 cycles
1872                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1873                               InstrStage<2, [A9_NPipe]>],
1874                              [4, 1, 2, 2, 3, 3, 1]>
1875]>;
1876
1877// ===---------------------------------------------------------------------===//
1878// The following definitions describe the simpler per-operand machine model.
1879// This works with MachineScheduler and will eventually replace itineraries.
1880
// Wrapper record type that stores a list of WriteSequence records in its
// `Writes` field. Later definitions in this file take sub-ranges of that list
// (e.g. `A9WriteLMOpsList.Writes[0-1]`).
// NOTE(review): `SchedModel = ?` declares the field without a value;
// presumably it exists so that an enclosing `let SchedModel = ... in` can be
// applied to records of this class -- confirm.
1881class A9WriteLMOpsListType<list<WriteSequence> writes> {
1882  list <WriteSequence> Writes = writes;
1883  SchedMachineModel SchedModel = ?;
1884}
1885
1886// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
// The per-field comments below record where each number comes from; most are
// estimates rather than documented hardware parameters.
1887def CortexA9Model : SchedMachineModel {
1888  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
1889  let MicroOpBufferSize = 56; // Based on available renamed registers.
1890  let LoadLatency = 2; // Optimistic load latency assuming bypass.
1891                       // This is overridden by OperandCycles if the
1892                       // Itineraries are queried instead.
1893  let MispredictPenalty = 8; // Based on estimate of pipeline depth.
1894
  // Keep the legacy itinerary data (CortexA9Itineraries, defined earlier in
  // this file) attached to the model.
1895  let Itineraries = CortexA9Itineraries;
1896
1897  // FIXME: Many vector operations were never given an itinerary. We
1898  // haven't mapped these to the new model either.
1899  let CompleteModel = 0;
1900
1901  // FIXME: Remove when all errors have been fixed.
1902  let FullInstRWOverlapCheck = 0;
1903}
1904
1905//===----------------------------------------------------------------------===//
1906// Define each kind of processor resource and number available.
1907//
1908// The AGU unit has BufferSize=1 so that the latency between operations
1909// that use it is considered to stall other operations.
1910//
1911// The FP unit has BufferSize=0 so that it is a hard dispatch
1912// hazard. No instruction may be dispatched while the unit is reserved.
1913
1914let SchedModel = CortexA9Model in {
1915
// Processor resources of the per-operand model. The template argument is the
// number of units available for each kind of resource.
1916def A9UnitALU : ProcResource<2>;
// NOTE(review): `Super = A9UnitALU` presumably makes the multiplier one of the
// ALU units rather than an additional unit -- confirm against the
// ProcResource documentation in TargetSchedule.td.
1917def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
// BufferSize = 1: see the AGU note in the comment block above.
1918def A9UnitAGU : ProcResource<1> { let BufferSize = 1; }
1919def A9UnitLS  : ProcResource<1>;
// BufferSize = 0: hard dispatch hazard, see the FP note in the comment block
// above.
1920def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
1921def A9UnitB   : ProcResource<1>;
1922
1923//===----------------------------------------------------------------------===//
1924// Define scheduler read/write types with their resources and latency on A9.
1925
1926// Consume an issue slot, but no processor resources. This is useful when all
1927// other writes associated with the operand have NumMicroOps = 0.
1928def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
1929
1930// Write an integer register. Uses one ALU unit; Latency is left at its default.
1931def A9WriteI : SchedWriteRes<[A9UnitALU]>;
1932// Write an integer register whose source is shifted by a register (latency 2).
1933def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
1934
1935// Basic ALU.
// NOTE(review): the ItinRW mappings later in this file reference WriteALU, not
// A9WriteALU; the binding between the two is not visible in this part of the
// file -- confirm it exists.
1936def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
1937// ALU with operand shifted by immediate. Bound directly to WriteALUsi
// instead of going through an A9-specific write type.
1938def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; }
1939// ALU with operand shifted by register.
1940def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
1941
1942// Multiplication
// The *Hi types describe the second (high) result of a long multiply. They set
// NumMicroOps = 0 and carry one extra cycle of latency over the matching
// low-half type.
// NOTE(review): A9WriteM lists A9UnitMul twice while the other multiply types
// list it once; presumably this models a two-cycle occupancy of the
// multiplier -- confirm.
1943def A9WriteM   : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
1944def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
1945                                              let NumMicroOps = 0; }
1946def A9WriteM16   : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
1947def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
1948                                                let NumMicroOps = 0; }
// Map the generic multiply / multiply-accumulate write types onto the A9
// types above. MUL and MAC of the same width share one A9 type.
1949def : SchedAlias<WriteMUL16, A9WriteM16>;
1950def : SchedAlias<WriteMUL32, A9WriteM>;
1951def : SchedAlias<WriteMUL64Lo, A9WriteM>;
1952def : SchedAlias<WriteMUL64Hi, A9WriteMHi>;
1953def : SchedAlias<WriteMAC16, A9WriteM16>;
1954def : SchedAlias<WriteMAC32, A9WriteM>;
1955def : SchedAlias<WriteMAC64Lo, A9WriteM>;
1956def : SchedAlias<WriteMAC64Hi, A9WriteMHi>;
// Multiply operands get a read advance of 0 cycles.
1957def : ReadAdvance<ReadMUL, 0>;
1958def : ReadAdvance<ReadMAC, 0>;
1959
1960// Floating-point
1961// Only one FP or AGU instruction may issue per cycle. We model this
1962// by having FP instructions consume the AGU resource.
// Every type below uses the same resource pair; only the latency differs.
1963def A9WriteF      : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
1964def A9WriteFMov   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
1965def A9WriteFMulS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
1966def A9WriteFMulD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
1967def A9WriteFMAS   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
1968
1969def A9WriteFMAD   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
1970def A9WriteFDivS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
1971def A9WriteFDivD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
1972def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
1973def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
1974
1975// NEON has an odd mix of latencies. Simply name the write types by latency.
// A9WriteV<N> has Latency = N and uses the same FP + AGU resource pair as the
// VFP write types. There is no A9WriteV8 in this list.
1976def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
1977def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
1978def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
1979def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
1980def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
1981def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
1982def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
1983def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
1984def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
1985
// Bind the generic NEON structured load/store write types with an empty
// resource list and default latency.
// NOTE(review): presumably placeholders, since the comments further down say
// NEON VLDn is described by the A9WriteLfp* sequences -- confirm.
1986def : WriteRes<WriteVLD1, []>;
1987def : WriteRes<WriteVLD2, []>;
1988def : WriteRes<WriteVLD3, []>;
1989def : WriteRes<WriteVLD4, []>;
1990def : WriteRes<WriteVST1, []>;
1991def : WriteRes<WriteVST2, []>;
1992def : WriteRes<WriteVST3, []>;
1993def : WriteRes<WriteVST4, []>;
1994
1995// Reserve A9UnitFP for 2 consecutive cycles.
// ResourceCycles is given in the same order as the resource list, so
// [2, 1] means 2 cycles of A9UnitFP and 1 cycle of A9UnitAGU. The numeric
// suffix of each name is its Latency.
1996def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
1997  let Latency = 4;
1998  let ResourceCycles = [2, 1];
1999}
2000def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
2001  let Latency = 7;
2002  let ResourceCycles = [2, 1];
2003}
2004def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
2005  let Latency = 9;
2006  let ResourceCycles = [2, 1];
2007}
2008
2009// Branches don't have a def operand but still consume resources.
2010def A9WriteB : SchedWriteRes<[A9UnitB]>;
2011
2012// Address generation. Uses the AGU but no micro-op (NumMicroOps = 0).
2013def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
2014
2015// Load Integer. Also serves as the A9 definition of the generic WriteLd.
2016def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
2017def : SchedAlias<WriteLd, A9WriteL>;
2018// Load the upper 32-bits using the same micro-op.
2019def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
2020                                     let NumMicroOps = 0; }
2021// Offset shifted by register.
// NOTE(review): elsewhere in this file the "si" suffix means "shifted by
// immediate" (see WriteALUsi) -- confirm which form this type describes.
2022def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
2023// Load (and zero extend) a byte.
2024def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
// Byte load with shifted offset: one cycle more than A9WriteLb.
2025def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
2026
2027// Load or Store Float, aligned.
2028def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
2029
2030// Store Integer.
2031def A9WriteS : SchedWriteRes<[A9UnitLS]>;
2032
2033//===----------------------------------------------------------------------===//
2034// Define resources dynamically for load multiple variants.
2035
2036// Define helpers for extra latency without consuming resources.
// A9WriteCycle1 is a single cycle with no resources and no micro-ops.
// A9WriteCycle2 .. A9WriteCycle8 are sequences that repeat it NumCycles times.
2037def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
2038foreach NumCycles = 2-8 in {
2039def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
2040} // foreach NumCycles
2041
2042// Define address generation sequences and predicates for 8 flavors of LDMs.
2043foreach NumAddr = 1-8 in {
2044
2045// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
2046// latency for instructions that generate multiple loads or stores.
2047def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
2048
2049// Define a predicate to select the LDM based on number of memory addresses.
// The predicate string is C++ emitted into the generated scheduling code.
// NOTE(review): "(n+1)/2" presumably rounds the address count up to a number
// of register pairs using integer division -- confirm the return type of
// getNumLDMAddresses.
2050def A9LMAdr#NumAddr#Pred :
2051  SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
2052
2053} // foreach NumAddr
2054
2055// Fall-back for unknown LDMs: matches when getNumLDMAddresses reports 0.
2056def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
2057
2058// LDM/VLDM/VLDn address generation latency & resources.
2059// Dynamically select the A9WriteAdrN sequence using a predicate.
// Each A9LMAdr<N>Pred picks the A9WriteAdr<N> sequence with the same N.
2060def A9WriteLMAdr : SchedWriteVariant<[
2061  SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
2062  SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
2063  SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
2064  SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
2065  SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
2066  SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
2067  SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
2068  SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
2069  // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
2070  SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
2071
2072// Define LDM Resources.
2073// These take no issue resource, so they can be combined with other
2074// writes like WriteB.
2075// A9WriteLMLo takes a single LS resource and 2 cycles.
2076def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
2077                                              let NumMicroOps = 0; }
2078// Assuming aligned access, the upper half of each pair is free with
2079// the same latency.
2080def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
2081                                      let NumMicroOps = 0; }
2082// Each A9WriteL#N variant adds N cycles of latency without consuming
2083// additional resources.
// A9WriteL<N> is A9WriteLMLo followed by A9WriteCycle<N>; A9WriteL<N>Hi is
// A9WriteLMHi followed by A9WriteCycle<N>. The !cast looks the A9WriteCycle<N>
// record up by its concatenated name.
2084foreach NumAddr = 1-8 in {
2085def A9WriteL#NumAddr : WriteSequence<
2086  [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
2087def A9WriteL#NumAddr#Hi : WriteSequence<
2088  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
2089}
2090
2091//===----------------------------------------------------------------------===//
2092// LDM: Load multiple into 32-bit integer registers.
2093
// Ordered list of the LDM write sequences as (Lo, Hi) pairs for N = 1..8.
// Pair N occupies indices 2*(N-1) and 2*(N-1)+1; A9WriteLM slices this list
// by index, so the order must not change.
2094def A9WriteLMOpsList : A9WriteLMOpsListType<
2095                 [A9WriteL1, A9WriteL1Hi,
2096                  A9WriteL2, A9WriteL2Hi,
2097                  A9WriteL3, A9WriteL3Hi,
2098                  A9WriteL4, A9WriteL4Hi,
2099                  A9WriteL5, A9WriteL5Hi,
2100                  A9WriteL6, A9WriteL6Hi,
2101                  A9WriteL7, A9WriteL7Hi,
2102                  A9WriteL8, A9WriteL8Hi]>;
2103
2104// A9WriteLM variants expand into a pair of writes for each 64-bit
2105// value loaded. When the number of registers is odd, the last
2106// A9WriteLnHi is naturally ignored because the instruction has no
2107// following def operands.  These variants take no issue resource, so
2108// they may need to be part of a WriteSequence that includes A9WriteIssue.
// For A9LMAdr<N>Pred the slice Writes[0-(2N-1)] selects the first N (Lo, Hi)
// pairs of A9WriteLMOpsList.
2109def A9WriteLM : SchedWriteVariant<[
2110  SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
2111  SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
2112  SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
2113  SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
2114  SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
2115  SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
2116  SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
2117  SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
2118  // For unknown LDMs, define the maximum number of writes, but only
2119  // make the first two consume resources.
  // From the third pair on, both entries are the Hi sequence, which is built
  // from A9WriteLMHi and so uses no LS resource.
2120  SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
2121                             A9WriteL2, A9WriteL2Hi,
2122                             A9WriteL3Hi, A9WriteL3Hi,
2123                             A9WriteL4Hi, A9WriteL4Hi,
2124                             A9WriteL5Hi, A9WriteL5Hi,
2125                             A9WriteL6Hi, A9WriteL6Hi,
2126                             A9WriteL7Hi, A9WriteL7Hi,
2127                             A9WriteL8Hi, A9WriteL8Hi]>]> {
2128  let Variadic = 1;
2129}
2130
2131//===----------------------------------------------------------------------===//
2132// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
2133
2134// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources
2135// so can be used in WriteSequences for single-issue instructions that
2136// encapsulate multiple loads.
2137def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
2138  let Latency = 1;
2139  let NumMicroOps = 0;
2140}
2141
// For each N in 1..8 define three sequences built from A9WriteLfpOp:
// A9WriteLfp<N>Seq, A9WriteLfp<N> and A9WriteLfp<N>Mov.
2142foreach NumAddr = 1-8 in {
2143
2144// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
2145def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
2146
2147// A9WriteLfp1-8 definitions are statically expanded into a sequence of
2148// A9WriteLfpOps with additive latency that takes a single issue slot.
2149// Used directly to describe NEON VLDn.
2150def A9WriteLfp#NumAddr : WriteSequence<
2151  [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
2152
2153// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
2154// permuting loaded values.
// NOTE(review): the sequence is prefixed with A9WriteF, which is declared with
// Latency = 4 in this file, not 1 -- confirm whether "a cycle" is accurate.
2155def A9WriteLfp#NumAddr#Mov : WriteSequence<
2156  [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
2157
2158} // foreach NumAddr
2159
2160// Define VLDM/VSTM PreRA resources.
2161// A9WriteLMfpPreRA is dynamically expanded into the correct
2162// A9WriteLfp1-8 sequence based on a predicate. This supports the
2163// preRA VLDM variants in which all 64-bit loads are written to the
2164// same tuple of either single or double precision registers.
2165def A9WriteLMfpPreRA : SchedWriteVariant<[
2166  SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
2167  SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
2168  SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
2169  SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
2170  SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
2171  SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
2172  SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
2173  SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
2174  // For unknown VLDM/VSTM PreRA, assume 2xS registers.
2175  SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
2176
2177// Define VLDM/VSTM PostRA Resources.
2178// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency.
2179def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
2180
2181foreach NumAddr = 1-8 in {
2182
2183// Each A9WriteLMfp#N variant adds N cycles of latency without consuming
2184// additional resources.
2185def A9WriteLMfp#NumAddr : WriteSequence<
2186  [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
2187
2188// Assuming aligned access, the upper half of each pair is free with
2189// the same latency. This reuses A9WriteLMHi from the integer LDM definitions.
2190def A9WriteLMfp#NumAddr#Hi : WriteSequence<
2191  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
2192
2193} // foreach NumAddr
2194
2195// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
2196// pair of writes for each 64-bit data loaded. When the number of
2197// registers is odd, the last WriteLMfpnHi is naturally ignored because
2198// the instruction has no following def operands.
2199
// Layout: indices 0-7 hold the eight Lo sequences; the Hi sequences follow
// from index 8. A9WriteLMfp1Hi appears once and every other Hi sequence twice.
// The trailing comments give the index range of each row; A9WriteLMfpPostRA
// slices the list by these indices, so the order must not change.
2200def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
2201                 [A9WriteLMfp1, A9WriteLMfp2,       // 0-1
2202                  A9WriteLMfp3, A9WriteLMfp4,       // 2-3
2203                  A9WriteLMfp5, A9WriteLMfp6,       // 4-5
2204                  A9WriteLMfp7, A9WriteLMfp8,       // 6-7
2205                  A9WriteLMfp1Hi,                   // 8-8
2206                  A9WriteLMfp2Hi, A9WriteLMfp2Hi,   // 9-10
2207                  A9WriteLMfp3Hi, A9WriteLMfp3Hi,   // 11-12
2208                  A9WriteLMfp4Hi, A9WriteLMfp4Hi,   // 13-14
2209                  A9WriteLMfp5Hi, A9WriteLMfp5Hi,   // 15-16
2210                  A9WriteLMfp6Hi, A9WriteLMfp6Hi,   // 17-18
2211                  A9WriteLMfp7Hi, A9WriteLMfp7Hi,   // 19-20
2212                  A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
2213
// Post-RA VLDM write selection. For A9LMAdr<N>Pred the slice takes the first N
// Lo sequences (indices 0..N-1) followed by N Hi sequences ending at the last
// A9WriteLMfp<N>Hi entry of A9WriteLMfpPostRAOpsList.
// NOTE(review): because the Hi range is a contiguous window of width N, it
// mixes Hi sequences of different latencies (e.g. Writes[10-12] is
// A9WriteLMfp2Hi, A9WriteLMfp3Hi, A9WriteLMfp3Hi) -- confirm this is intended.
2214def A9WriteLMfpPostRA : SchedWriteVariant<[
2215  SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
2216  SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
2217  SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
2218  SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
2219  SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
2220  SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
2221  SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
2222  SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
2223  // For unknown LDMs, define the maximum number of writes, but only
2224  // make the first two consume resources. We are optimizing for the case
2225  // where the operands are DPRs, and this determines the first eight
2226  // types. The remaining eight types are filled to cover the case
2227  // where the operands are SPRs.
2228  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
2229                             A9WriteLMfp3Hi, A9WriteLMfp4Hi,
2230                             A9WriteLMfp5Hi, A9WriteLMfp6Hi,
2231                             A9WriteLMfp7Hi, A9WriteLMfp8Hi,
2232                             A9WriteLMfp5Hi, A9WriteLMfp5Hi,
2233                             A9WriteLMfp6Hi, A9WriteLMfp6Hi,
2234                             A9WriteLMfp7Hi, A9WriteLMfp7Hi,
2235                             A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
2236  let Variadic = 1;
2237}
2238
2239// Distinguish between our multiple MI-level forms of the same
2240// VLDM/VSTM instructions.
// The test is whether operand 0 holds a virtual (pre-RA) or a physical
// (post-RA) register.
// NOTE(review): assumes operand 0 is always a register operand for the
// instructions these predicates are applied to -- confirm.
2241def A9PreRA : SchedPredicate<
2242  "MI->getOperand(0).getReg().isVirtual()">;
2243def A9PostRA : SchedPredicate<
2244  "MI->getOperand(0).getReg().isPhysical()">;
2245
2246// VLDM represents all destination registers as a single register
2247// tuple, unlike LDM. So the number of write operands is not variadic.
// Dispatches to the pre-RA or post-RA variant using the A9PreRA / A9PostRA
// predicates.
2248def A9WriteLMfp : SchedWriteVariant<[
2249  SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
2250  SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
2251
2252//===----------------------------------------------------------------------===//
2253// Resources for other (non-LDM/VLDM) Variants.
2254
2255// These mov immediate writers are unconditionally expanded with
2256// additive latency.
// Each is two A9WriteI steps. The "pc" form appends a WriteALU step and the
// "ld" form appends an A9WriteL step.
2257def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
2258def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>;
2259def A9WriteI2ld  : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
2260
2261// Some ALU operations can read loaded integer values one cycle early.
// The advance of 1 cycle applies only to the integer load write types listed
// here.
2262def A9ReadALU : SchedReadAdvance<1,
2263  [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
2264   A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
2265   A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
2266   A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
2267   A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
2268
2269// Read types for operands that are unconditionally read in cycle N
2270// after the instruction issues, decreases producer latency by N-1.
// A9Read<N> therefore carries an advance of N-1 cycles and, unlike A9ReadALU,
// is not restricted to a list of write types.
2271def A9Read2 : SchedReadAdvance<1>;
2272def A9Read3 : SchedReadAdvance<2>;
2273def A9Read4 : SchedReadAdvance<3>;
2274
2275//===----------------------------------------------------------------------===//
2276// Map itinerary classes to scheduler read/write resources per operand.
2277//
2278// For ARM, we piggyback scheduler resources on the Itinerary classes
2279// to avoid perturbing the existing instruction definitions.
2280
2281// This table follows the ARM Cortex-A9 Technical Reference Manuals,
2282// mostly in order.
2283
// Move and move-not itinerary classes.
// In each ItinRW the first list holds the scheduling read/write types and the
// second list the itinerary classes they are attached to.
2284def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
2285                         IIC_iMVNi,IIC_iMVNsi,
2286                         IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
2287def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
2288def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
2289
// Two-instruction immediate moves use the additive A9WriteI2* sequences.
2290def :ItinRW<[A9WriteI2],   [IIC_iMOVix2,IIC_iCMOVix2]>;
2291def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
2292def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
2293
// ALU, compare, test and extend itinerary classes, grouped by operand form:
// plain (WriteALU), shifted by immediate (WriteALUsi), shifted by register
// (A9WriteALUsr). Entries that list A9ReadALU get the early-read advance for
// loaded values on that operand.
2294def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
2295def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
2296def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>;
2297def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
2298def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>;
2299def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB
2300def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
2301def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>;
2302
2303// A9WriteMHi is ignored for MUL32.
// The 32-bit and 64-bit multiply / multiply-accumulate classes share one
// mapping: A9WriteM for the first def and A9WriteMHi for the second.
2304def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
2305                                     IIC_iMUL64,IIC_iMAC64]>;
2306// FIXME: SMLALxx needs itin classes
2307def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
2308
2309// TODO: For floating-point ops, we model the pipeline forwarding
2310// latencies here. WAW latencies are sometimes longer.
2311
// VFP itinerary classes mapped onto the A9WriteF* writes.
// NOTE(review): IIC_fpMOVDI lists A9WriteFMov twice, presumably one write
// per def operand -- confirm against the instruction definition.
2312def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
2313                            IIC_fpUNA32, IIC_fpUNA64,
2314                            IIC_fpCMP32, IIC_fpCMP64]>;
2315def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
2316def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
2317                         IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
2318                         IIC_fpALU32, IIC_fpALU64]>;
2319def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
2320def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
2321def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
2322def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
2323def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
2324def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
2325def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
2326def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
2327
// Branches use the A9WriteB write.
2328def :ItinRW<[A9WriteB], [IIC_Br]>;
2329
2330// A9 PLD is processed in a dedicated unit.
// The empty list attaches no scheduler read/write types to IIC_Preload.
2331def :ItinRW<[], [IIC_Preload]>;
2332
2333// Note: We must assume that loads are aligned, since the machine
2334// model cannot know this statically and A9 ignores alignment hints.
2335
2336// A9WriteAdr consumes AGU regardless of address writeback. But its
2337// latency is only relevant for users of an updated address.
// For loads the data write(s) come first and the A9WriteAdr* write last.
2338def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
2339                                     IIC_iLoad_iu,IIC_iLoad_ru]>;
2340def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
2341def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
2342                                       IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
2343def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
2344def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
2345                                            IIC_iLoad_d_ru]>;
2346// Store either has no def operands, or the one def for address writeback.
// For stores the A9WriteAdr* write is therefore listed first, ahead of
// A9WriteS.
2347def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
2348                                     IIC_iStore_iu, IIC_iStore_ru,
2349                                     IIC_iStore_d_i, IIC_iStore_d_r,
2350                                     IIC_iStore_d_ru]>;
2351def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
2352                                      IIC_iStore_bh_i, IIC_iStore_bh_r,
2353                                      IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
2354def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
2355
2356// A9WriteLM will be expanded into a separate write for each def
2357// operand. Address generation consumes resources, but A9WriteLMAdr
2358// is listed after all def operands, so has no effective latency.
2359//
2360// Note: A9WriteLM expands into an even number of def operands. The
2361// actual number of def operands may be less by one.
2362def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
2363
2364// Load multiple with address writeback has an extra def operand in
2365// front of the loaded registers.
2366//
2367// Reuse the load-multiple variants for store-multiple because the
2368// resources are identical. For stores only the address writeback
2369// has a def operand so the WriteL latencies are unused.
2370def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
2371                                                      IIC_iStore_m,
2372                                                      IIC_iStore_mu]>;
// The two entries below end with A9WriteB and WriteALU respectively, in
// place of the A9WriteIssue used by the plain load-multiple entries.
2373def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
2374def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>;
2375
// FP loads and stores. As for the integer entries, the address write is
// last for loads without writeback (IIC_fpLoad32/64, IIC_fpLoad_m) and
// first for IIC_fpLoad_mu and for the stores.
2376def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
2377
2378def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
2379def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
2380def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
2381                                        IIC_fpStore_m, IIC_fpStore_mu]>;
2382
2383// Note: Unlike VLDM, VLD1 expects the writeback operand after the
2384// normal writes.
// Each entry therefore lists the A9WriteLfp* data write first and the
// A9WriteAdr* address write second. The same entry is used for a class
// and its "u" variant.
2385def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
2386                                         IIC_VLD1x2, IIC_VLD1x2u]>;
2387def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
2388                                         IIC_VLD1x4, IIC_VLD1x4u,
2389                                         IIC_VLD4dup, IIC_VLD4dupu]>;
2390def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
2391                                            IIC_VLD2, IIC_VLD2u,
2392                                            IIC_VLD2dup, IIC_VLD2dupu]>;
2393def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
2394                                            IIC_VLD2x2, IIC_VLD2x2u,
2395                                            IIC_VLD2ln, IIC_VLD2lnu]>;
2396def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
2397                                            IIC_VLD3dup, IIC_VLD3dupu]>;
2398def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
2399                                            IIC_VLD4ln, IIC_VLD4lnu]>;
2400def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
2401
2402// Vector stores use similar resources to vector loads, so use the
2403// same write types. The address write must be first for stores with
2404// address writeback.
// Only two mappings are used for all vector store classes:
// A9WriteAdr1 + A9WriteLfp1 and A9WriteAdr2 + A9WriteLfp2.
2405def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
2406                                         IIC_VST1x2, IIC_VST1x2u,
2407                                         IIC_VST1ln, IIC_VST1lnu,
2408                                         IIC_VST2, IIC_VST2u,
2409                                         IIC_VST2x2, IIC_VST2x2u,
2410                                         IIC_VST2ln, IIC_VST2lnu]>;
2411def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
2412                                         IIC_VST1x4, IIC_VST1x4u,
2413                                         IIC_VST3, IIC_VST3u,
2414                                         IIC_VST3ln, IIC_VST3lnu,
2415                                         IIC_VST4, IIC_VST4u,
2416                                         IIC_VST4ln, IIC_VST4lnu]>;
2417
2418// NEON moves.
2419def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
2420def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
2421def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
2422
2423// NEON integer arithmetic
2424//
// A9Read2 / A9Read3 mark source operands read in cycle 2 / 3 after issue
// (see the A9Read* definitions above).
//
2425// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
2426def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
2427// VSUB/VMVN/VCLSD/VCLZD/VCNTD
2428def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
2429// VADDL/VSUBL/VNEG are mapped later under IIC_VSHLi*.
2430// ...
2431// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
2432def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
2433
2434// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
2435def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
2436// VQNEG/VQABS
2437def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
2438// VABS
2439def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
2440// VPADD/VPADDL are mapped later under IIC_VSHLi*.
2441// ...
2442// VCLSQ/VCLZQ/VCNTQ, takes two cycles.
2443def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
2444// VMOVimm/VMVNimm/VORRimm/VBICimm
2445def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
2446def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
2447def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
2448
2449// NEON integer multiply
2450//
2451// Note: these don't quite match the timing docs, but they do match
2452// the original A9 itinerary.
// The multiply-accumulate (VMAC*) entries add a leading A9Read3 read in
// front of the reads used by the matching multiply entry.
2453def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
2454def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
2455def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
2456def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
2457def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
2458def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
2459def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
2460def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
2461
2462// NEON integer shift
2463// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
// The D and Q forms currently share a write type; no read advances are
// attached to the shift classes.
2464def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
2465def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
2466
2467// NEON permute
// Entries with two writes (IIC_VPERM*, IIC_VEXT*) give one write per def.
// NOTE(review): ReadDefault presumably fills the first read slot so that
// the A9Read2/A9Read3 entries line up with later operands -- confirm.
2468def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
2469def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
2470            [IIC_VPERMQ3, IIC_VEXTQ]>;
2471def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
2472def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
2473def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
2474def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
2475def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
2476def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
2477def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
2478def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
2479            [IIC_VTBX4]>;
2480
2481// NEON floating-point
// In each D/Q pair below the Q form uses the next-higher A9WriteV* write
// than the D form.
2482def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
2483def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
2484def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
2485def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
2486def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
2487def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
2488def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
2489def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
2490
2491// Map SchedRWs that are identical for cortexa9 to existing resources.
// WriteALUsr and WriteALUSsr both alias A9WriteALUsr; ReadALU and
// ReadALUsr both alias A9ReadALU.
2492def : SchedAlias<WriteALU, A9WriteALU>;
2493def : SchedAlias<WriteALUsr, A9WriteALUsr>;
2494def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
2495def : SchedAlias<ReadALU, A9ReadALU>;
2496def : SchedAlias<ReadALUsr, A9ReadALU>;
2497def : SchedAlias<WriteST, A9WriteS>;
2498
2499// ===---------------------------------------------------------------------===//
2500// Floating-point. Map target defined SchedReadWrite to processor specific ones
2501//
// FP conversions get their own WriteRes: A9UnitFP and A9UnitAGU, with a
// latency of 4. The other generic FP writes alias existing A9 writes.
2502def : WriteRes<WriteFPCVT, [A9UnitFP, A9UnitAGU]> { let Latency = 4; }
2503def : SchedAlias<WriteFPMOV, A9WriteFMov>;
2504
// 32-bit and 64-bit FP ALU share A9WriteF.
2505def : SchedAlias<WriteFPALU32, A9WriteF>;
2506def : SchedAlias<WriteFPALU64, A9WriteF>;
2507
2508def : SchedAlias<WriteFPMUL32, A9WriteFMulS>;
2509def : SchedAlias<WriteFPMUL64, A9WriteFMulD>;
2510
2511def : SchedAlias<WriteFPMAC32, A9WriteFMAS>;
2512def : SchedAlias<WriteFPMAC64, A9WriteFMAD>;
2513
2514def : SchedAlias<WriteFPDIV32, A9WriteFDivS>;
2515def : SchedAlias<WriteFPDIV64, A9WriteFDivD>;
2516def : SchedAlias<WriteFPSQRT32, A9WriteFSqrtS>;
2517def : SchedAlias<WriteFPSQRT64, A9WriteFSqrtD>;
2518
// ReadFPMUL and ReadFPMAC are given a read advance of 0 cycles.
2519def : ReadAdvance<ReadFPMUL, 0>;
2520def : ReadAdvance<ReadFPMAC, 0>;
2521
2522// ===---------------------------------------------------------------------===//
2523// Subtarget-specific overrides. Map opcodes to list of SchedReadWrite types.
2524//
// Bitwise AND/ORR/EOR/BIC: per-opcode overrides by operand form
// (ri/rr, rsi, rsr).
2525def : InstRW< [WriteALU],
2526      (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
2527                 "BICrr")>;
2528def : InstRW< [WriteALUsi], (instrs ANDrsi, ORRrsi, EORrsi, BICrsi)>;
2529def : InstRW< [WriteALUsr], (instrs ANDrsr, ORRrsr, EORrsr, BICrsr)>;
2530
2531
// All three generic compare writes alias A9WriteALU.
2532def : SchedAlias<WriteCMP, A9WriteALU>;
2533def : SchedAlias<WriteCMPsi, A9WriteALU>;
2534def : SchedAlias<WriteCMPsr, A9WriteALU>;
2535
// Move overrides; these use the same write types as the ItinRW entries
// for IIC_iMOVsr, IIC_iMVNr, IIC_iMOVix2, IIC_iMOVix2addpc and
// IIC_iMOVix2ld earlier in this file.
2536def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
2537                                       "MOVCCsr")>;
2538def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
2539def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm")>;
2540def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
2541def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
2542
2543def : InstRW< [WriteALU], (instregex "SEL")>;
2544
2545def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
2546
// Multiplies with a single A9WriteM write.
2547def : InstRW< [A9WriteM],
2548      (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
2549      "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
// Long multiplies: A9WriteM plus A9WriteMHi for the second def.
2550def : InstRW< [A9WriteM, A9WriteMHi],
2551      (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
2552      "UMAAL", "SMLALv5", "UMLALv5", "SMLALBB", "SMLALBT", "SMLALTB",
2553      "SMLALTT")>;
2554// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
2555def : InstRW< [A9WriteM, A9WriteMHi],
2556      (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
2557      "SMLSLD", "SMLSLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
2558
// 16-bit multiplies and multiply-accumulates use the A9WriteM16 pair, as
// in the ItinRW entry for IIC_iMUL16/IIC_iMAC16.
2559def : InstRW<[A9WriteM16, A9WriteM16Hi],
2560      (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
2561def : InstRW<[A9WriteM16, A9WriteM16Hi],
2562      (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
2563
// Per-opcode load overrides, mirroring the ItinRW load mappings:
//   A9WriteL    - word load, immediate offset
//   A9WriteLsi  - word load, shifted-register offset (LDRrs)
//   A9WriteLb   - byte/halfword loads
//   A9WriteLbsi - byte load, shifted-register offset (LDRBrs)
2564def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
2565def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
2566def : InstRW<[A9WriteLb],
2567      (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
2568      "LDRH", "LDRSH", "LDRSB")>;
// This entry previously repeated "LDRrs", which overlapped the A9WriteLsi
// entry above and left LDRBrs without an override.
2569def : InstRW<[A9WriteLbsi], (instregex "LDRBrs")>;
2570
// WriteDIV is given no processor resources and a latency of 0.
// NOTE(review): presumably a placeholder because this model does not
// describe an integer divider -- confirm.
2571def : WriteRes<WriteDIV, []> { let Latency = 0; }
2572
// All generic branch writes occupy A9UnitB.
2573def : WriteRes<WriteBr, [A9UnitB]>;
2574def : WriteRes<WriteBrL, [A9UnitB]>;
2575def : WriteRes<WriteBrTbl, [A9UnitB]>;
// Preloads consume no modelled resources; no-ops additionally have zero
// latency and zero micro-ops.
2576def : WriteRes<WritePreLd, []>;
2577def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
2578} // SchedModel = CortexA9Model
2579