xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td (revision f126d349810fdb512c0b01e101342d430b947488)
//==- AArch64SchedA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for the ARM Cortex-A55 processors.
10//
11//===----------------------------------------------------------------------===//
12
13// ===---------------------------------------------------------------------===//
14// The following definitions describe the per-operand machine model.
15// This works with MachineScheduler. See MCSchedModel.h for details.
16
// Machine model for the Cortex-A55, used for scheduling and for other
// instruction cost heuristics.
def CortexA55Model : SchedMachineModel {
  // The Cortex-A55 is an in-order processor.
  let MicroOpBufferSize = 0;

  // It dual-issues under most circumstances.
  let IssueWidth = 2;

  // Cycles for loads to access the cache. The optimisation guide shows that
  // most loads have a latency of 3, but some have a latency of 4 or 5.
  // Setting it to 4 looked to be a good trade-off.
  let LoadLatency = 4;

  // A branch direction mispredict.
  let MispredictPenalty = 8;

  // Enable the PostRA scheduler pass.
  let PostRAScheduler = 1;

  // Covers instructions applicable to Cortex-A55.
  let CompleteModel = 0;

  list<Predicate> UnsupportedFeatures = [HasSVE];

  // FIXME: Remove when all errors have been fixed.
  let FullInstRWOverlapCheck = 0;
}
34
35//===----------------------------------------------------------------------===//
36// Define each kind of processor resource and number available.
37
// Every pipeline is modeled as a ProcResource. BufferSize = 0 is applied to
// all of them since the Cortex-A55 is in-order.
let BufferSize = 0 in {
  def CortexA55UnitALU   : ProcResource<2>; // Int ALU
  def CortexA55UnitMAC   : ProcResource<1>; // Int MAC, 64-bit wide
  def CortexA55UnitDiv   : ProcResource<1>; // Int Division, not pipelined
  def CortexA55UnitLd    : ProcResource<1>; // Load pipe
  def CortexA55UnitSt    : ProcResource<1>; // Store pipe
  def CortexA55UnitB     : ProcResource<1>; // Branch

  // The FP DIV/SQRT instructions execute totally differently from the FP ALU
  // instructions, which can mostly be dual-issued; that's why for now we
  // model them with 2 resources.
  def CortexA55UnitFPALU : ProcResource<2>; // FP ALU
  def CortexA55UnitFPMAC : ProcResource<2>; // FP MAC
  def CortexA55UnitFPDIV : ProcResource<1>; // FP Div/SQRT, 64/128
}
54
55//===----------------------------------------------------------------------===//
56// Subtarget-specific SchedWrite types
57
58let SchedModel = CortexA55Model in {
59
// Integer ALU writes. These latencies are modeled without taking forwarding
// paths into account (the software optimisation guide lists latencies that
// do take typical forwarding paths into account).
let Latency = 3 in {
  def : WriteRes<WriteImm,   [CortexA55UnitALU]>; // MOVN, MOVZ
  def : WriteRes<WriteI,     [CortexA55UnitALU]>; // ALU
  def : WriteRes<WriteISReg, [CortexA55UnitALU]>; // ALU of Shifted-Reg
  def : WriteRes<WriteIEReg, [CortexA55UnitALU]>; // ALU of Extended-Reg
  def : WriteRes<WriteExtr,  [CortexA55UnitALU]>; // EXTR from a reg pair
  def : WriteRes<WriteIS,    [CortexA55UnitALU]>; // Shift/Scale
}

// MAC
let Latency = 4 in {
  def : WriteRes<WriteIM32, [CortexA55UnitMAC]>; // 32-bit Multiply
  def : WriteRes<WriteIM64, [CortexA55UnitMAC]>; // 64-bit Multiply
}

// Div - the divider is held for as many cycles as the latency.
let Latency = 8, ResourceCycles = [8] in {
  def : WriteRes<WriteID32, [CortexA55UnitDiv]>;
  def : WriteRes<WriteID64, [CortexA55UnitDiv]>;
}
81
// Load
def : WriteRes<WriteLD, [CortexA55UnitLd]> {
  let Latency = 3;
}
def : WriteRes<WriteLDIdx, [CortexA55UnitLd]> {
  let Latency = 4;
}
def : WriteRes<WriteLDHi, [CortexA55UnitLd]> {
  let Latency = 5;
}

// Vector Load - Vector loads take 1-5 cycles to issue. For WriteVLD below,
//               the median of 3 is chosen, which makes the latency 6.
// An extra cycle is needed to get the swizzling right.
def : WriteRes<WriteVLD, [CortexA55UnitLd]> {
  let Latency = 6;
  let ResourceCycles = [3];
}

// Single-cycle-issue vector load; the SI variant additionally sets
// SingleIssue.
def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 4;
}
def CortexA55WriteVLD1SI : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 4;
  let SingleIssue = 1;
}

// CortexA55WriteVLD<N>, N = 2..8: the load pipe is held for N cycles and the
// latency is N + 3.
def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 5;
  let ResourceCycles = [2];
}
def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 6;
  let ResourceCycles = [3];
}
def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 7;
  let ResourceCycles = [4];
}
def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 8;
  let ResourceCycles = [5];
}
def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 9;
  let ResourceCycles = [6];
}
def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 10;
  let ResourceCycles = [7];
}
def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 11;
  let ResourceCycles = [8];
}

// Writes used as the second result of the load-pair InstRWs.
def CortexA55WriteLDP1 : SchedWriteRes<[]> {
  let Latency = 4;
}
def CortexA55WriteLDP2 : SchedWriteRes<[CortexA55UnitLd]> {
  let Latency = 5;
}
// NOTE(review): the load pipe is listed five times here although the name
// says LDP4 - confirm that five entries are intended.
def CortexA55WriteLDP4
    : SchedWriteRes<[CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd,
                     CortexA55UnitLd, CortexA55UnitLd]> {
  let Latency = 6;
}

// Pre/Post Indexing - Performed as part of address generation
def : WriteRes<WriteAdr, []> {
  let Latency = 0;
}
115
// Store - plain, pair and indexed stores are marked RetireOOO.
let RetireOOO = 1, Latency = 1 in {
  def : WriteRes<WriteST,    [CortexA55UnitSt]>;
  def : WriteRes<WriteSTP,   [CortexA55UnitSt]>;
  def : WriteRes<WriteSTIdx, [CortexA55UnitSt]>;
}
def : WriteRes<WriteSTX, [CortexA55UnitSt]> {
  let Latency = 4;
}

// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
def : WriteRes<WriteVST, [CortexA55UnitSt]> {
  let Latency = 5;
  let ResourceCycles = [2];
}
def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> {
  let Latency = 4;
}
def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> {
  let Latency = 5;
  let ResourceCycles = [2];
}
def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> {
  let Latency = 6;
  let ResourceCycles = [3];
}
// NOTE(review): latency 5 is lower than CortexA55WriteVST3's 6 even though
// the store pipe is held longer - confirm against the optimisation guide.
def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> {
  let Latency = 5;
  let ResourceCycles = [4];
}

// Atomics are marked as unsupported in this model.
def : WriteRes<WriteAtomic, []> {
  let Unsupported = 1;
}
136
// Branch - branches, system instructions, barriers and hints all use the
// branch unit, with no fields overridden.
foreach BranchWrite = [WriteBr, WriteBrReg, WriteSys, WriteBarrier,
                       WriteHint] in
  def : WriteRes<BranchWrite, [CortexA55UnitB]>;
143
// FP ALU
//   The WriteF result is produced in F5 and can mostly be forwarded to a
//   consumer at F1, so the effective latency is set to 4.
def : WriteRes<WriteF, [CortexA55UnitFPALU]> {
  let Latency = 4;
}
def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> {
  let Latency = 3;
}
def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> {
  let Latency = 4;
}
def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> {
  let Latency = 3;
}
def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> {
  let Latency = 3;
}
def : WriteRes<WriteVd, [CortexA55UnitFPALU]> {
  let Latency = 4;
}
// WriteVq lists the FP ALU resource twice and begins a new issue group.
def : WriteRes<WriteVq, [CortexA55UnitFPALU, CortexA55UnitFPALU]> {
  let Latency = 4;
  let BeginGroup = 1;
}

// FP ALU specific new schedwrite definitions; the _F<N> suffix is the
// latency assigned to the write.
def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> {
  let Latency = 2;
}
def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> {
  let Latency = 3;
}
def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> {
  let Latency = 4;
}
def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> {
  let Latency = 5;
}
160
// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> {
  let Latency = 4;
}

let RetireOOO = 1 in {
  // NOTE(review): ResourceCycles (29) exceeds Latency (22) here, unlike the
  // per-precision divide writes below - confirm this is intended.
  def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> {
    let Latency = 22;
    let ResourceCycles = [29];
  }

  def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> {
    let Latency = 4;
  }

  // Divide, by precision (half / single / double).
  def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> {
    let Latency = 8;
    let ResourceCycles = [5];
  }
  def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> {
    let Latency = 13;
    let ResourceCycles = [10];
  }
  def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> {
    let Latency = 22;
    let ResourceCycles = [19];
  }

  // Square root, by precision (half / single / double).
  def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> {
    let Latency = 8;
    let ResourceCycles = [5];
  }
  def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> {
    let Latency = 12;
    let ResourceCycles = [9];
  }
  def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> {
    let Latency = 22;
    let ResourceCycles = [19];
  }
}
181//===----------------------------------------------------------------------===//
182// Subtarget-specific SchedRead types.
183
def : ReadAdvance<ReadVLD,     0>;
def : ReadAdvance<ReadExtrHi,  1>;
def : ReadAdvance<ReadAdrBase, 1>;
def : ReadAdvance<ReadST,      1>;

// ALU - ALU input operands are generally needed in EX1. An operand produced
//       in, say, EX2 can be forwarded for consumption to the ALU in EX1,
//       thereby allowing back-to-back ALU operations such as add. If an
//       operand requires a shift, it will, however, be required in the ISS
//       stage.
def : ReadAdvance<ReadI, 2,
                  [WriteImm, WriteI,
                   WriteISReg, WriteIEReg, WriteIS,
                   WriteID32, WriteID64,
                   WriteIM32, WriteIM64]>;

// Shifted operand: advance of 1.
def CortexA55ReadShifted
    : SchedReadAdvance<1,
                       [WriteImm, WriteI,
                        WriteISReg, WriteIEReg, WriteIS,
                        WriteID32, WriteID64,
                        WriteIM32, WriteIM64]>;

// Operand that is not shifted: advance of 2.
def CortexA55ReadNotShifted
    : SchedReadAdvance<2,
                       [WriteImm, WriteI,
                        WriteISReg, WriteIEReg, WriteIS,
                        WriteID32, WriteID64,
                        WriteIM32, WriteIM64]>;

// ReadISReg resolves to the shifted variant when RegShiftedPred holds, and
// to the not-shifted variant otherwise.
def CortexA55ReadISReg : SchedReadVariant<[
  SchedVar<RegShiftedPred, [CortexA55ReadShifted]>,
  SchedVar<NoSchedPred,    [CortexA55ReadNotShifted]>
]>;
def : SchedAlias<ReadISReg, CortexA55ReadISReg>;

// ReadIEReg resolves to the shifted variant when RegExtendedPred holds, and
// to the not-shifted variant otherwise.
def CortexA55ReadIEReg : SchedReadVariant<[
  SchedVar<RegExtendedPred, [CortexA55ReadShifted]>,
  SchedVar<NoSchedPred,     [CortexA55ReadNotShifted]>
]>;
def : SchedAlias<ReadIEReg, CortexA55ReadIEReg>;

// MUL
def : ReadAdvance<ReadIM, 1,
                  [WriteImm, WriteI,
                   WriteISReg, WriteIEReg, WriteIS,
                   WriteID32, WriteID64,
                   WriteIM32, WriteIM64]>;
def : ReadAdvance<ReadIMA, 2,
                  [WriteImm, WriteI,
                   WriteISReg, WriteIEReg, WriteIS,
                   WriteID32, WriteID64,
                   WriteIM32, WriteIM64]>;

// Div
def : ReadAdvance<ReadID, 1,
                  [WriteImm, WriteI,
                   WriteISReg, WriteIEReg, WriteIS,
                   WriteID32, WriteID64,
                   WriteIM32, WriteIM64]>;
231
232//===----------------------------------------------------------------------===//
233// Subtarget-specific InstRWs.
234
//---
// Miscellaneous
//---
// Load pair, immediate-offset forms: one write per destination register.
def : InstRW<[CortexA55WriteVLD1SI, CortexA55WriteLDP1],
             (instregex "LDPS?Wi")>;
def : InstRW<[CortexA55WriteVLD1, CortexA55WriteLDP1],
             (instregex "LDPSi")>;
def : InstRW<[CortexA55WriteVLD1, CortexA55WriteLDP2],
             (instregex "LDP(X|D)i")>;
def : InstRW<[CortexA55WriteVLD1, CortexA55WriteLDP4],
             (instregex "LDPQi")>;

// Load pair, pre/post-indexed forms: WriteAdr comes first.
def : InstRW<[WriteAdr, CortexA55WriteVLD1SI, CortexA55WriteLDP1],
             (instregex "LDPS?W(pre|post)")>;
def : InstRW<[WriteAdr, CortexA55WriteVLD1, CortexA55WriteLDP1],
             (instregex "LDPS(pre|post)")>;
def : InstRW<[WriteAdr, CortexA55WriteVLD1, CortexA55WriteLDP2],
             (instregex "LDP(X|D)(pre|post)")>;
def : InstRW<[WriteAdr, CortexA55WriteVLD1, CortexA55WriteLDP4],
             (instregex "LDPQ(pre|post)")>;

// COPY is scheduled as a plain ALU write.
def : InstRW<[WriteI], (instrs COPY)>;
//---
// Vector Loads - 64-bit per cycle
//---
//   1-element structures
def : InstRW<[CortexA55WriteVLD1],
             (instregex "LD1i(8|16|32|64)$")>;                 // single element
def : InstRW<[CortexA55WriteVLD1],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;  // replicate
def : InstRW<[CortexA55WriteVLD1],
             (instregex "LD1Onev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA55WriteVLD2],
             (instregex "LD1Onev(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVLD2],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;             // multiple structures
def : InstRW<[CortexA55WriteVLD4],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVLD3],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA55WriteVLD6],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVLD4],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA55WriteVLD8],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;

//   Post-indexed forms: same load write plus WriteAdr.
def : InstRW<[CortexA55WriteVLD1, WriteAdr],
             (instregex "LD1i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVLD1, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD1, WriteAdr],
             (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[CortexA55WriteVLD2, WriteAdr],
             (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD2, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
def : InstRW<[CortexA55WriteVLD4, WriteAdr],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD3, WriteAdr],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[CortexA55WriteVLD6, WriteAdr],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD4, WriteAdr],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[CortexA55WriteVLD8, WriteAdr],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
272
//    2-element structures
def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;

//    Post-indexed forms: same load write plus WriteAdr. The patterns require
//    the _POST suffix (as for LD1/LD3/LD4) so that they do not also match
//    the non-write-back opcodes already mapped above.
def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
283
//    3-element structures
def : InstRW<[CortexA55WriteVLD2],
             (instregex "LD3i(8|16|32|64)$")>;
def : InstRW<[CortexA55WriteVLD2],
             (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVLD3],
             (instregex "LD3Threev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA55WriteVLD6],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;

//    Post-indexed forms: same load write plus WriteAdr.
def : InstRW<[CortexA55WriteVLD2, WriteAdr],
             (instregex "LD3i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVLD2, WriteAdr],
             (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD3, WriteAdr],
             (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[CortexA55WriteVLD6, WriteAdr],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;

//    4-element structures
// Load single 4-el structure to one lane of 4 regs.
def : InstRW<[CortexA55WriteVLD2],
             (instregex "LD4i(8|16|32|64)$")>;
// Load single 4-el structure, replicate to all lanes of 4 regs.
def : InstRW<[CortexA55WriteVLD2],
             (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
// Load multiple 4-el structures to 4 regs.
def : InstRW<[CortexA55WriteVLD4],
             (instregex "LD4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA55WriteVLD8],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;

//    Post-indexed forms: same load write plus WriteAdr.
def : InstRW<[CortexA55WriteVLD2, WriteAdr],
             (instregex "LD4i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVLD2, WriteAdr],
             (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVLD4, WriteAdr],
             (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[CortexA55WriteVLD8, WriteAdr],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
305
//---
// Vector Stores
//---
//   ST1
def : InstRW<[CortexA55WriteVST1],
             (instregex "ST1i(8|16|32|64)$")>;
def : InstRW<[CortexA55WriteVST1],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVST1],
             (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVST2],
             (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA55WriteVST4],
             (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
//   ST1, post-indexed forms: same store write plus WriteAdr.
def : InstRW<[CortexA55WriteVST1, WriteAdr],
             (instregex "ST1i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVST1, WriteAdr],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVST1, WriteAdr],
             (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVST2, WriteAdr],
             (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[CortexA55WriteVST4, WriteAdr],
             (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
319
//   ST2
def : InstRW<[CortexA55WriteVST2],
             (instregex "ST2i(8|16|32|64)$")>;
def : InstRW<[CortexA55WriteVST2],
             (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[CortexA55WriteVST4],
             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
//   ST2, post-indexed forms: same store write plus WriteAdr.
def : InstRW<[CortexA55WriteVST2, WriteAdr],
             (instregex "ST2i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVST2, WriteAdr],
             (instregex "ST2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[CortexA55WriteVST4, WriteAdr],
             (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
326
//   ST3
def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
//   ST3, post-indexed forms: same store write plus WriteAdr. The arrangement
//   list is kept identical to the non-post-indexed pattern above.
def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
331
//   ST4
def : InstRW<[CortexA55WriteVST2],
             (instregex "ST4i(8|16|32|64)$")>;
def : InstRW<[CortexA55WriteVST4],
             (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
//   ST4, post-indexed forms: same store write plus WriteAdr.
def : InstRW<[CortexA55WriteVST2, WriteAdr],
             (instregex "ST4i(8|16|32|64)_POST$")>;
def : InstRW<[CortexA55WriteVST4, WriteAdr],
             (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
336
//---
// Floating Point Conversions, MAC, DIV, SQRT
//---
// DUP and XTN use the latency-2 FP ALU write.
def : InstRW<[CortexA55WriteFPALU_F2],
             (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
def : InstRW<[CortexA55WriteFPALU_F2],
             (instregex "^XTN")>;

// FCVT* conversions.
def : InstRW<[CortexA55WriteFPALU_F3],
             (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
def : InstRW<[CortexA55WriteFPALU_F4],
             (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;

// SCVTF / UCVTF conversions.
def : InstRW<[CortexA55WriteFPALU_F4],
             (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
def : InstRW<[CortexA55WriteFPALU_F4],
             (instregex "^(S|U)CVTF(h|s|d)")>;
def : InstRW<[CortexA55WriteFPALU_F4],
             (instregex "^(S|U)CVTFv")>;

// Multiply-accumulate.
def : InstRW<[CortexA55WriteFMAC],
             (instregex "^FN?M(ADD|SUB).*")>;
def : InstRW<[CortexA55WriteFMAC],
             (instregex "^FML(A|S).*")>;

// Divide: the three FDIV*rr opcodes listed by name, then the FDIVv* opcodes
// selected by their 16/32/64 name suffix.
def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>;
def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>;
def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>;
def : InstRW<[CortexA55WriteFDivHP],
             (instregex "^FDIVv.*16$")>;
def : InstRW<[CortexA55WriteFDivSP],
             (instregex "^FDIVv.*32$")>;
def : InstRW<[CortexA55WriteFDivDP],
             (instregex "^FDIVv.*64$")>;

// Square root: any opcode containing SQRT, selected by its 16/32/64 name
// suffix.
def : InstRW<[CortexA55WriteFSqrtHP],
             (instregex "^.*SQRT.*16$")>;
def : InstRW<[CortexA55WriteFSqrtSP],
             (instregex "^.*SQRT.*32$")>;
def : InstRW<[CortexA55WriteFSqrtDP],
             (instregex "^.*SQRT.*64$")>;
360
361}
362