xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td (revision e92ffd9b626833ebdbf2742c8ffddc6cd94b963e)
1//==- AArch64SchedCortexA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for the ARM Cortex-A55 processors.
10//
11//===----------------------------------------------------------------------===//
12
13// ===---------------------------------------------------------------------===//
14// The following definitions describe the per-operand machine model.
15// This works with MachineScheduler. See MCSchedModel.h for details.
16
17// Cortex-A55 machine model for scheduling and other instruction cost heuristics.
18def CortexA55Model : SchedMachineModel {
19  let MicroOpBufferSize = 0;  // The Cortex-A55 is an in-order processor
20  let IssueWidth = 2;         // It dual-issues under most circumstances
21  let LoadLatency = 4;        // Cycles for loads to access the cache. The
22                              // optimisation guide shows that most loads have
23                              // a latency of 3, but some have a latency of 4
24                              // or 5. Setting it to 4 looked to be a good trade-off.
25  let MispredictPenalty = 8;  // A branch direction mispredict.
26  let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
27  let CompleteModel = 0;      // Covers instructions applicable to Cortex-A55.
28
29  list<Predicate> UnsupportedFeatures = [HasSVE];  // SVE is listed as unsupported by this model.
30
31  // FIXME: Remove when all errors have been fixed.
32  let FullInstRWOverlapCheck = 0;
33}
34
35//===----------------------------------------------------------------------===//
36// Define each kind of processor resource and number available.
37
38// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
39// Cortex-A55 is in-order.
40
41def CortexA55UnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
42def CortexA55UnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC, 64-bit wide
43def CortexA55UnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division, not pipelined
44def CortexA55UnitLd     : ProcResource<1> { let BufferSize = 0; } // Load pipe
45def CortexA55UnitSt     : ProcResource<1> { let BufferSize = 0; } // Store pipe
46def CortexA55UnitB      : ProcResource<1> { let BufferSize = 0; } // Branch
47
48// The FP DIV/SQRT instructions execute totally differently from the FP ALU
49// instructions, which can mostly be dual-issued; that's why for now we model
50// them with 2 resources.
51def CortexA55UnitFPALU  : ProcResource<2> { let BufferSize = 0; } // FP ALU, 2 available
52def CortexA55UnitFPMAC  : ProcResource<2> { let BufferSize = 0; } // FP MAC, 2 available
53def CortexA55UnitFPDIV  : ProcResource<1> { let BufferSize = 0; } // FP Div/SQRT, 64/128, 1 available
54
55//===----------------------------------------------------------------------===//
56// Subtarget-specific SchedWrite types
57
58let SchedModel = CortexA55Model in {
59
60// These latencies are modeled without taking into account forwarding paths
61// (the software optimisation guide lists latencies taking into account
62// typical forwarding paths).
63def : WriteRes<WriteImm, [CortexA55UnitALU]> { let Latency = 3; }    // MOVN, MOVZ
64def : WriteRes<WriteI, [CortexA55UnitALU]> { let Latency = 3; }      // ALU
65def : WriteRes<WriteISReg, [CortexA55UnitALU]> { let Latency = 3; }  // ALU of Shifted-Reg
66def : WriteRes<WriteIEReg, [CortexA55UnitALU]> { let Latency = 3; }  // ALU of Extended-Reg
67def : WriteRes<WriteExtr, [CortexA55UnitALU]> { let Latency = 3; }   // EXTR from a reg pair
68def : WriteRes<WriteIS, [CortexA55UnitALU]> { let Latency = 3; }     // Shift/Scale
69
70// MAC - 32-bit and 64-bit multiplies both use the single MAC unit, latency 4.
71def : WriteRes<WriteIM32, [CortexA55UnitMAC]> { let Latency = 4; }   // 32-bit Multiply
72def : WriteRes<WriteIM64, [CortexA55UnitMAC]> { let Latency = 4; }   // 64-bit Multiply
73
74// Div - ResourceCycles equals Latency, so the divider stays busy for the whole divide.
75def : WriteRes<WriteID32, [CortexA55UnitDiv]> {
76  let Latency = 8; let ResourceCycles = [8];
77}
78def : WriteRes<WriteID64, [CortexA55UnitDiv]> {
79  let Latency = 8; let ResourceCycles = [8];
80}
81
82// Load - all three use the single load pipe; latencies range from 3 to 5.
83def : WriteRes<WriteLD, [CortexA55UnitLd]> { let Latency = 3; }
84def : WriteRes<WriteLDIdx, [CortexA55UnitLd]> { let Latency = 4; }
85def : WriteRes<WriteLDHi, [CortexA55UnitLd]> { let Latency = 5; }
86
87// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVLD
88//               below, choosing the median of 3 which makes the latency 6.
89// An extra cycle is needed to get the swizzling right.
90def : WriteRes<WriteVLD, [CortexA55UnitLd]> { let Latency = 6;
91                                           let ResourceCycles = [3]; }
92def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; }  // VLD<N>: Latency = N + 3; ResourceCycles = [N] for N >= 2
93def CortexA55WriteVLD1SI : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; let SingleIssue = 1; }  // VLD1 variant with SingleIssue set
94def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5;
95                                                  let ResourceCycles = [2]; }
96def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6;
97                                                  let ResourceCycles = [3]; }
98def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 7;
99                                                  let ResourceCycles = [4]; }
100def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 8;
101                                                  let ResourceCycles = [5]; }
102def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 9;
103                                                  let ResourceCycles = [6]; }
104def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 10;
105                                                  let ResourceCycles = [7]; }
106def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11;
107                                                  let ResourceCycles = [8]; }
108
109def CortexA55WriteLDP1 : SchedWriteRes<[]> { let Latency = 4; }  // latency only; no resource listed
110def CortexA55WriteLDP2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5; }  // one Ld entry
111def CortexA55WriteLDP4 : SchedWriteRes<[CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd]> { let Latency = 6; }  // NOTE(review): five Ld entries despite the "LDP4" name - confirm intended
112
113// Pre/Post Indexing - Performed as part of address generation
114def : WriteRes<WriteAdr, []> { let Latency = 0; }  // no resource, zero latency
115
116// Store
117let RetireOOO = 1 in {  // RetireOOO is set on the three plain store writes in this group
118def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 1; }
119def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 1; }
120def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 1; }
121}
122def : WriteRes<WriteSTX, [CortexA55UnitSt]> { let Latency = 4; }  // defined outside the RetireOOO group
123
124// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
125def : WriteRes<WriteVST, [CortexA55UnitSt]> { let Latency = 5;
126                                          let ResourceCycles = [2];}
127def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 4; }
128def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
129                                                  let ResourceCycles = [2]; }
130def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 6;
131                                                  let ResourceCycles = [3]; }
132def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;  // NOTE(review): lower than VST3's latency of 6 - confirm intended
133                                                  let ResourceCycles = [4]; }
134
135def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }  // WriteAtomic is marked unsupported in this model
136
137// Branch
138def : WriteRes<WriteBr, [CortexA55UnitB]>;       // these five set no explicit Latency or ResourceCycles
139def : WriteRes<WriteBrReg, [CortexA55UnitB]>;
140def : WriteRes<WriteSys, [CortexA55UnitB]>;      // system, barrier and hint writes also go to the branch unit
141def : WriteRes<WriteBarrier, [CortexA55UnitB]>;
142def : WriteRes<WriteHint, [CortexA55UnitB]>;
143
144// FP ALU
145//   As WriteF result is produced in F5 and it can be mostly forwarded
146//   to consumer at F1, the effective latency is set as 4.
147def : WriteRes<WriteF, [CortexA55UnitFPALU]> { let Latency = 4; }
148def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; }
149def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; }
150def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; }
151def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; }
152def : WriteRes<WriteV, [CortexA55UnitFPALU]> { let Latency = 4; }
153
154// FP ALU specific SchedWrite definitions; the _F<N> suffix matches the latency N.
155def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;}
156def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;}
157def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;}
158
159// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
160def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; }
161// The HP/SP/DP divide and square-root writes below all use ResourceCycles = Latency - 3.
162let RetireOOO = 1 in {
163def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22;  // NOTE(review): ResourceCycles (29) exceeds Latency (22), unlike the HP/SP/DP writes - confirm intended
164                                            let ResourceCycles = [29]; }
165def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; }  // FP MAC write; also inside the RetireOOO group
166def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
167                                                     let ResourceCycles = [5]; }
168def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 13;
169                                                     let ResourceCycles = [10]; }
170def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
171                                                     let ResourceCycles = [19]; }
172def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
173                                                      let ResourceCycles = [5]; }
174def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 12;
175                                                      let ResourceCycles = [9]; }
176def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
177                                                      let ResourceCycles = [19]; }
178}
179//===----------------------------------------------------------------------===//
180// Subtarget-specific SchedRead types.
181
182def : ReadAdvance<ReadVLD, 0>;       // advance of 0 cycles
183def : ReadAdvance<ReadExtrHi, 1>;    // advance of 1 cycle, no producer list given
184def : ReadAdvance<ReadAdrBase, 1>;   // advance of 1 cycle, no producer list given
185
186// ALU - ALU input operands are generally needed in EX1. An operand produced in
187//       say EX2 can be forwarded for consumption to ALU in EX1, thereby
188//       allowing back-to-back ALU operations such as add. If an operand requires
189//       a shift, it will, however, be required in ISS stage.
190def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
191                             WriteISReg, WriteIEReg,WriteIS,
192                             WriteID32,WriteID64,
193                             WriteIM32,WriteIM64]>;
194// Shifted operand: advance of 1 rather than 2, with the same producer list as ReadI.
195def CortexA55ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
196                                          WriteISReg, WriteIEReg,WriteIS,
197                                          WriteID32,WriteID64,
198                                          WriteIM32,WriteIM64]>;
199def CortexA55ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
200                                             WriteISReg, WriteIEReg,WriteIS,
201                                             WriteID32,WriteID64,
202                                             WriteIM32,WriteIM64]>;
203def CortexA55ReadISReg : SchedReadVariant<[
204        SchedVar<RegShiftedPred, [CortexA55ReadShifted]>,
205        SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>;
206def : SchedAlias<ReadISReg, CortexA55ReadISReg>;
207// Extended-register reads reuse the same shifted / not-shifted advances, selected by RegExtendedPred.
208def CortexA55ReadIEReg : SchedReadVariant<[
209        SchedVar<RegExtendedPred, [CortexA55ReadShifted]>,
210        SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>;
211def : SchedAlias<ReadIEReg, CortexA55ReadIEReg>;
212
213// MUL
214def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
215                              WriteISReg, WriteIEReg,WriteIS,
216                              WriteID32,WriteID64,
217                              WriteIM32,WriteIM64]>;
218def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
219                               WriteISReg, WriteIEReg,WriteIS,
220                               WriteID32,WriteID64,
221                               WriteIM32,WriteIM64]>;
222// ReadIMA above gets one more cycle of advance than ReadIM, for the same producers.
223// Div - advance of 1 from the same producer list.
224def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
225                              WriteISReg, WriteIEReg,WriteIS,
226                              WriteID32,WriteID64,
227                              WriteIM32,WriteIM64]>;
228
229//===----------------------------------------------------------------------===//
230// Subtarget-specific InstRWs.
231
232//---
233// Miscellaneous
234//---
235def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W")>;  // uses the SingleIssue variant of VLD1
236def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS[^W]")>;
237def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)")>;
238def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ")>;
239def : InstRW<[WriteI], (instrs COPY)>;  // COPY is scheduled with the plain ALU write
240//---
241// Vector Loads - 64-bit per cycle
242//---
243//   1-element structures
244def : InstRW<[CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;                // single element
245def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
246def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
247def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
248def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
249def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
250def : InstRW<[CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
251def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
252def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
253def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
254// Post-indexed forms: same writes as above plus WriteAdr.
255def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
256def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
257def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
258def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
259def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
260def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
261def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
262def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
263def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
264def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
265
266//    2-element structures
267def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
268def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
269def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
270def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
271// Post-indexed forms only: "_POST" is required so these patterns do not also match the plain opcodes above.
272def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
273def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
274def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
275def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
276
277//    3-element structures
278def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
279def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
280def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
281def : InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
282// Post-indexed forms: same writes as above plus WriteAdr.
283def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
284def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
285def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
286def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
287
288//    4-element structures
289def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
290def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
291def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
292def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
293// Post-indexed forms: same writes as above plus WriteAdr.
294def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
295def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
296def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
297def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
298
299//---
300// Vector Stores
301//---
302def : InstRW<[CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)$")>;  // ST1: one pattern per register count, all arrangements together
303def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
304def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
305def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
306def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
307def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;  // post-indexed forms add WriteAdr
308def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
309def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
310def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
311def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
312// ST2: the 8b/4h/2s forms use VST2, the 16b/8h/4s/2d forms use VST4.
313def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
314def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
315def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
316def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
317def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
318def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
319
320def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
321def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
322def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
323def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;  // same arrangement list as the non-POST pattern
324
325def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>;  // single-lane form uses VST2
326def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;  // four-register form uses VST4
327def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;  // post-indexed forms add WriteAdr
328def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
329
330//---
331// Floating Point Conversions, MAC, DIV, SQRT
332//---
333def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
334def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
335// All SCVTF/UCVTF patterns below use the latency-4 FP ALU write.
336def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
337def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
338def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
339// Multiply-accumulate opcodes use the FMAC write; divides and square roots pick a write by element width.
340def : InstRW<[CortexA55WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
341def : InstRW<[CortexA55WriteFMAC], (instregex "^FML(A|S).*")>;
342def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>;
343def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>;
344def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>;
345def : InstRW<[CortexA55WriteFDivHP], (instregex "^FDIVv.*16$")>;
346def : InstRW<[CortexA55WriteFDivSP], (instregex "^FDIVv.*32$")>;
347def : InstRW<[CortexA55WriteFDivDP], (instregex "^FDIVv.*64$")>;
348def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;  // NOTE(review): not anchored to FSQRT; any opcode containing SQRT and ending in 16/32/64 matches - confirm intended
349def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
350def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
351
352}
353