xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA510.td (revision 56b17de1e8360fe131d425de20b5e75ff3ea897c)
1//==- AArch64SchedCortexA510.td - ARM Cortex-A510 Scheduling Definitions -*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for the ARM Cortex-A510 processor.
10//
11//===----------------------------------------------------------------------===//
12
13// ===---------------------------------------------------------------------===//
14// The following definitions describe the per-operand machine model.
15// This works with MachineScheduler. See MCSchedModel.h for details.
16
17// Cortex-A510 machine model for scheduling and other instruction cost heuristics.
18def CortexA510Model : SchedMachineModel {
19  let MicroOpBufferSize = 0;  // The Cortex-A510 is an in-order processor
20  let IssueWidth = 3;         // Up to 3 micro-ops may be issued per cycle
21  let LoadLatency = 3;        // Cycles for loads to access the cache.
22                              // Most loads have a latency of 2, but some have higher latencies.
23                              // 3 seems to be a good tradeoff
24  let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
25  let CompleteModel = 0;      // Incomplete model: not every instruction has scheduling info.
26
27  // FIXME: Re-enable the full InstRW overlap check once all overlapping definitions are fixed.
28  let FullInstRWOverlapCheck = 0;
29}
30
31
32//===----------------------------------------------------------------------===//
33// Subtarget-specific SchedWrite types
34
35let SchedModel = CortexA510Model in {
36
37//===----------------------------------------------------------------------===//
38// Define each kind of processor resource and number available.
39
40// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
41// Cortex-A510 is in-order.
42let BufferSize = 0 in {
43  def CortexA510UnitALU0   : ProcResource<1>;    // Int ALU0
44  def CortexA510UnitALU12  : ProcResource<2>;    // Int ALU1 & ALU2
45  def CortexA510UnitMAC    : ProcResource<1>;    // Int MAC, 64-bit wide
46  def CortexA510UnitDiv    : ProcResource<1>;    // Int Division, not pipelined
47  // There are 2 LS pipes: 1 shared for Load/Store, 1 for Load only
48  def CortexA510UnitLdSt   : ProcResource<1>;    // Load/Store shared pipe
49  def CortexA510UnitLd1    : ProcResource<1>;    // Load pipe
50  def CortexA510UnitB      : ProcResource<1>;    // Branch
51  def CortexA510UnitPAC    : ProcResource<1>;    // Pointer Authentication (PAC) pipe
52
53  // The FP DIV/SQRT instructions execute totally differently from the FP ALU
54  // instructions, which can mostly be dual-issued; that's why for now we model
55  // them with 2 resources.
56  def CortexA510UnitVALU0  : ProcResource<1>;    // SIMD/FP/SVE ALU0
57  def CortexA510UnitVALU1  : ProcResource<1>;    // SIMD/FP/SVE ALU1
58  def CortexA510UnitVMAC   : ProcResource<2>;    // SIMD/FP/SVE MAC
59  def CortexA510UnitVMC    : ProcResource<1>;    // SIMD/FP/SVE multicycle instrs (e.g. Div, SQRT, cryptography)
60}
61
62def CortexA510UnitLd     : ProcResGroup<[CortexA510UnitLdSt, CortexA510UnitLd1]>;
63def CortexA510UnitVALU   : ProcResGroup<[CortexA510UnitVALU0, CortexA510UnitVALU1]>;
64def CortexA510UnitALU    : ProcResGroup<[CortexA510UnitALU0, CortexA510UnitALU12]>;
65// These latencies are modeled without taking into account forwarding paths
66// (the software optimisation guide lists latencies taking into account
67// typical forwarding paths).
68def : WriteRes<WriteImm, [CortexA510UnitALU]> { let Latency = 1; }    // MOVN, MOVZ
69def : WriteRes<WriteI, [CortexA510UnitALU]> { let Latency = 1; }      // ALU
70def : WriteRes<WriteISReg, [CortexA510UnitALU]> { let Latency = 2; }  // ALU of Shifted-Reg
71def : WriteRes<WriteIEReg, [CortexA510UnitALU]> { let Latency = 2; }  // ALU of Extended-Reg
72def : WriteRes<WriteExtr, [CortexA510UnitALU]> { let Latency = 2; }   // EXTR from a reg pair
73def : WriteRes<WriteIS, [CortexA510UnitALU]> { let Latency = 2; }     // Shift/Scale
74
75// MAC
76def : WriteRes<WriteIM32, [CortexA510UnitMAC]> { let Latency = 3; }   // 32-bit Multiply
77def : WriteRes<WriteIM64, [CortexA510UnitMAC]> { let Latency = 5; let ReleaseAtCycles = [2];}   // 64-bit Multiply
78
79// Div
80def : WriteRes<WriteID32, [CortexA510UnitDiv]> {
81  let Latency = 8; let ReleaseAtCycles = [8];
82}
83def : WriteRes<WriteID64, [CortexA510UnitDiv]> {
84  let Latency = 16; let ReleaseAtCycles = [16];
85}
86
87//===----------------------------------------------------------------------===//
88// Define customized scheduler read/write types specific to the Cortex A510
89
90//===----------------------------------------------------------------------===//
91class CortexA510Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> { // Write on one resource `res` with latency `n`
92  let Latency = n;
93}
94
95class CortexA510MCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> { // Multi-cycle write: latency `n`, `res` released at cycle `m`
96  let Latency = n;
97  let ReleaseAtCycles = [m];
98  let BeginGroup = 1;         // Must be the first micro-op of its issue group
99}
100
101class CortexA510MC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> { // Like CortexA510MCWrite but leaves ReleaseAtCycles at its default
102  let Latency = n;
103  let BeginGroup = 1;         // Must be the first micro-op of its issue group
104}
105
106//===----------------------------------------------------------------------===//
107// Define generic 2 micro-op types
108def A510Write_10cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
109  let Latency     = 10;
110  let NumMicroOps = 2;
111}
112
113def A510Write_15cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
114  let Latency     = 15;
115  let NumMicroOps = 2;
116}
117
118class A510Write_PAC_B <int lat> : SchedWriteRes<[CortexA510UnitPAC, CortexA510UnitB]> {
119  let Latency = lat;
120  let NumMicroOps = 2;
121}
122// Load
123def : WriteRes<WriteLD, [CortexA510UnitLd]> { let Latency = 2; }
124def : WriteRes<WriteLDIdx, [CortexA510UnitLd]> { let Latency = 2; }
125def : WriteRes<WriteLDHi, [CortexA510UnitLd]> { let Latency = 2; }
126
127def CortexA510WriteVLD1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
128def CortexA510WriteVLD1SI : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; let SingleIssue = 1; }
129def CortexA510WriteVLD2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 4;
130                                                  let ReleaseAtCycles = [2]; }
131def CortexA510WriteVLD3 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5;
132                                                  let ReleaseAtCycles = [3]; }
133def CortexA510WriteVLD4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6;
134                                                  let ReleaseAtCycles = [4]; }
135def CortexA510WriteVLD6 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5;
136                                                  let ReleaseAtCycles = [3]; }
137def CortexA510WriteVLD8 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6;
138                                                  let ReleaseAtCycles = [4]; }
139
140def CortexA510WriteLDP1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
141def CortexA510WriteLDP2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
142def CortexA510WriteLDP4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; }
143
144// Pre/Post Indexing - Performed as part of address generation
145def : WriteRes<WriteAdr, []> { let Latency = 0; }
146
147// Store
148let RetireOOO = 1 in {
149def : WriteRes<WriteST, [CortexA510UnitLdSt]> { let Latency = 1; }
150def : WriteRes<WriteSTP, [CortexA510UnitLdSt]> { let Latency = 1; }
151def : WriteRes<WriteSTIdx, [CortexA510UnitLdSt]> { let Latency = 1; }
152}
153def : WriteRes<WriteSTX, [CortexA510UnitLdSt]> { let Latency = 3; }
154
155// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
156def : WriteRes<WriteVST, [CortexA510UnitLdSt]> { let Latency = 5;
157                                          let ReleaseAtCycles = [2];}
158def CortexA510WriteVST1 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 4; }
159def CortexA510WriteVST2 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
160                                                  let ReleaseAtCycles = [2]; }
161def CortexA510WriteVST3 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
162                                                  let ReleaseAtCycles = [3]; }
163def CortexA510WriteVST4 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5;
164                                                  let ReleaseAtCycles = [4]; }
165
166def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
167
168// Branch
169def : WriteRes<WriteBr, [CortexA510UnitB]>;
170def : WriteRes<WriteBrReg, [CortexA510UnitB]>;
171def : WriteRes<WriteSys, [CortexA510UnitB]>;
172def : WriteRes<WriteBarrier, [CortexA510UnitB]>;
173def : WriteRes<WriteHint, [CortexA510UnitB]>;
174
175// FP ALU
176//   As WriteF result is produced in F5 and it can be mostly forwarded
177//   to consumer at F1, the effective latency is set to 4.
178def : WriteRes<WriteF, [CortexA510UnitVALU]> { let Latency = 4; }
179def : WriteRes<WriteFCmp, [CortexA510UnitVALU]> { let Latency = 3; }
180def : WriteRes<WriteFCvt, [CortexA510UnitVALU]> { let Latency = 4; }
181def : WriteRes<WriteFCopy, [CortexA510UnitVALU]> { let Latency = 3; }
182def : WriteRes<WriteFImm, [CortexA510UnitVALU]> { let Latency = 3; }
183
184class CortexA510VSt<int n> : SchedWriteRes<[CortexA510UnitLdSt]> {
185  let RetireOOO = 1;
186  let ReleaseAtCycles = [n];
187}
188
189def CortexA510VSt0      : SchedWriteRes<[CortexA510UnitLdSt]> {
190  let RetireOOO = 1;
191}
192
193def : SchedAlias<WriteVd, CortexA510Write<4, CortexA510UnitVALU>>;
194def : SchedAlias<WriteVq, CortexA510Write<4, CortexA510UnitVALU>>;
195
196// FP ALU specific new schedwrite definitions
197def CortexA510WriteFPALU_F3 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 3;}
198def CortexA510WriteFPALU_F4 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 4;}
199
200// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
201def : WriteRes<WriteFMul, [CortexA510UnitVMAC]> { let Latency = 4; }
202
203let RetireOOO = 1 in {
204def : WriteRes<WriteFDiv, [CortexA510UnitVMC]> { let Latency = 22;
205                                            let ReleaseAtCycles = [29]; } // NOTE(review): release cycle (29) exceeds latency (22), unlike the FDiv*/FSqrt* writes in this group - confirm
206def CortexA510WriteVMAC : SchedWriteRes<[CortexA510UnitVMAC]> { let Latency = 4; }
207def CortexA510WriteFDivHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8;
208                                                     let ReleaseAtCycles = [5]; }
209def CortexA510WriteFDivSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 13;
210                                                     let ReleaseAtCycles = [10]; }
211def CortexA510WriteFDivDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22;
212                                                     let ReleaseAtCycles = [19]; }
213def CortexA510WriteFSqrtHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8;
214                                                      let ReleaseAtCycles = [5]; }
215def CortexA510WriteFSqrtSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 12;
216                                                      let ReleaseAtCycles = [9]; }
217def CortexA510WriteFSqrtDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22;
218                                                      let ReleaseAtCycles = [19]; }
219}
220//===----------------------------------------------------------------------===//
221// Subtarget-specific SchedRead types.
222
223def : ReadAdvance<ReadVLD, 0>;
224def : ReadAdvance<ReadExtrHi, 0>;
225def : ReadAdvance<ReadAdrBase, 0>;
226def : ReadAdvance<ReadST, 1>;
227
228def : ReadAdvance<ReadI, 0>;
229def : ReadAdvance<ReadISReg, 0>;
230def : ReadAdvance<ReadIEReg, 0>;
231
232
233// MUL
234def : ReadAdvance<ReadIM, 0>;
235def : ReadAdvance<ReadIMA, 2>;
236
237// Div
238def : ReadAdvance<ReadID, 0>;
239
240//===----------------------------------------------------------------------===//
241// Subtarget-specific InstRWs.
242
243def A510WriteISReg : SchedWriteVariant<[
244       SchedVar<RegShiftedPred, [WriteISReg]>,
245       SchedVar<NoSchedPred, [WriteI]>]>;
246def : InstRW<[A510WriteISReg], (instregex ".*rs$")>;
247def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;
248
249// Pointer Authentication Instructions (v8.3 PAC)
250// -----------------------------------------------------------------------------
251
252// Authenticate data address
253// Authenticate instruction address
254// Compute pointer authentication code for data address
255// Compute pointer authentication code, using generic key
256// Compute pointer authentication code for instruction address
257def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>;
258
259// Branch and link, register, with pointer authentication
260// Branch, register, with pointer authentication
261// Branch, return, with pointer authentication
262def : InstRW<[A510Write_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
263                                            BRAAZ, BRAB, BRABZ, RETAA, RETAB,
264                                            ERETAA, ERETAB)>;
265
266// Load register, with pointer authentication
267def : InstRW<[CortexA510Write<2, CortexA510UnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>;
268
269// Strip pointer authentication code
270def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instrs XPACD, XPACI, XPACLRI)>;
271//---
272// Miscellaneous
273//---
274def : InstRW<[CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?Wi")>;
275def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPSi")>;
276def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)i")>;
277def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQi")>;
278def : InstRW<[WriteAdr, CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?W(pre|post)")>;
279def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPS(pre|post)")>;
280def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
281def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQ(pre|post)")>;
282def : InstRW<[WriteI], (instrs COPY)>;
283//---
284// Vector Loads - 128-bit per cycle
285//---
286//   1-element structures
287def : InstRW<[CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;                // single element
288def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
289def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
290def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
291def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
292def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
293def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
294def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
295def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
296def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
297
298def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
299def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
300def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
301def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; // NOTE(review): the non-post-index LD1Onev/LD1Twov forms use CortexA510WriteVLD1; VLD2 on these three lines looks inconsistent - confirm
302def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
303def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
304def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
305def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
306def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
307def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
308
309//    2-element structures
310def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
311def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
312def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
313def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
314
315def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)_POST$")>; // post-index forms only; WriteAdr models the base-register writeback
316def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
317def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
318def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
319
320//    3-element structures
321def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
322def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
323def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
324def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
325
326def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
327def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
328def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
329def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
330
331//    4-element structures
332def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
333def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
334def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
335def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
336
337def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
338def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
339def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
340def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
341
342//---
343// Vector Stores
344//---
345def : InstRW<[CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
346def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
347def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
348def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
349def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
350def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
351def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
352def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
353def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
354def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
355
356def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
357def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
358def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
359def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
360def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
361def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
362
363def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
364def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
365def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
366def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // same arrangement list as the non-post-index ST3Threev entry
367
368def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
369def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
370def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
371def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
372
373//---
374// Floating Point Conversions, MAC, DIV, SQRT
375//---
376def : InstRW<[CortexA510WriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
377def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^XTN")>;
378def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
379def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
380
381def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
382def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
383def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
384
385def : InstRW<[CortexA510WriteVMAC], (instregex "^FN?M(ADD|SUB).*")>;
386def : InstRW<[CortexA510WriteVMAC], (instregex "^FML(A|S)v.*")>;
387def : InstRW<[CortexA510WriteFDivHP], (instrs FDIVHrr)>;
388def : InstRW<[CortexA510WriteFDivSP], (instrs FDIVSrr)>;
389def : InstRW<[CortexA510WriteFDivDP], (instrs FDIVDrr)>;
390def : InstRW<[CortexA510WriteFDivHP], (instregex "^FDIVv.*16$")>;
391def : InstRW<[CortexA510WriteFDivSP], (instregex "^FDIVv.*32$")>;
392def : InstRW<[CortexA510WriteFDivDP], (instregex "^FDIVv.*64$")>;
393def : InstRW<[CortexA510WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
394def : InstRW<[CortexA510WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
395def : InstRW<[CortexA510WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
396
397def : InstRW<[CortexA510WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr)>;
398
399// 4.15. Advanced SIMD integer instructions
400// ASIMD absolute diff
401def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
402def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
403// ASIMD absolute diff accum
404def : InstRW<[CortexA510Write<6, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>;
405// ASIMD absolute diff long
406def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>;
407// ASIMD arith #1
408def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v",
409  "[SU]R?HADDv", "[SU]HSUBv")>;
410// ASIMD arith #2
411def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
412  "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
413  "ADDPv(2i32|4i16|8i8)$")>;
414def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>;
415def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
416  "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
417  "ADDPv(16i8|2i64|4i32|8i16)$")>;
418def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>;
419// ASIMD arith #3
420def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex  "SADDLv", "UADDLv", "SADDWv",
421  "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>;
422def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex  "ADDHNv", "SUBHNv")>;
423// ASIMD arith #5
424def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>;
425// ASIMD arith, reduce
426def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex  "ADDVv")>;
427def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex  "SADDLVv", "UADDLVv")>;
428// ASIMD compare #1
429def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
430def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
431// ASIMD compare #2
432def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
433def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
434// ASIMD logical #1
435def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8",
436  "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
437def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8",
438  "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
439// ASIMD max/min, basic
440def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
441def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>; // 128-bit forms; "4i32" was misspelled "4i132", so the v4i32 variants were never matched
442// ASIMD max/min, reduce
443def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>;
444// ASIMD multiply, by element
445def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
446  "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
447// ASIMD multiply
448def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv8i8)>;
449def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv16i8)>;
450// ASIMD multiply accumulate
451def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
452def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
453def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
454// ASIMD multiply accumulate half
455def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQRDML[AS]H[vi]")>;
456// ASIMD multiply accumulate long
457def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]ML[AS]Lv")>;
458// ASIMD multiply accumulate long #2
459def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQDML[AS]L[iv]")>;
460// ASIMD dot product
461def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv8i8")>;
462def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv16i8")>;
463// ASIMD dot product, by scalar
464def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTlanev")>;
465// ASIMD multiply long
466def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
467// ASIMD polynomial (8x8) multiply long
468def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>;
469// ASIMD pairwise add and accumulate
470def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>;
471// ASIMD shift accumulate
472def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
473def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
474// ASIMD shift accumulate #2
475def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>;
476// ASIMD shift by immed
477def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv",
478  "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
479// ASIMD shift by immed
480// SXTL and UXTL are aliases for SSHLL and USHLL
481def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[US]?SHLLv")>;
482// ASIMD shift by immed #2
483def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
484  "[SU]RSHRv(16i8|2i64|4i32|8i16)")>;
485def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)",
486  "RSHRNv(16i8|4i32|8i16)")>;
487// ASIMD shift by register
488def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
489def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
490// ASIMD shift by register #2
491def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
492def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
493
494def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>;
495def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>;
496
497def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>;
498def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>;
499
500// Cryptography extensions
501// -----------------------------------------------------------------------------
502
503// Crypto AES ops
504def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
505
506// Crypto polynomial (64x64) multiply long
507def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>;
508
509// Crypto SHA1 hash acceleration op
510// Crypto SHA1 schedule acceleration ops
511def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|SU1)")>;
512
513// Crypto SHA1 hash acceleration ops
514// Crypto SHA256 hash acceleration ops
515def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
516
517// Crypto SHA256 schedule acceleration ops
518def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>;
519
520// Crypto SHA512 hash acceleration ops
521def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>;
522
523// Crypto SHA3 ops
524def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3)>;
525def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs XAR)>;
526def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs RAX1)>;
527
528
529// Crypto SM3 ops
530def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
531                                                            "^SM3TT[12][AB]$")>;
532
533// Crypto SM4 ops
534def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>;
535
536// CRC
537// -----------------------------------------------------------------------------
538
539def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")>;
540
541// SVE Predicate instructions
542
543// Loop control, based on predicate
544def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP,
545                                                  BRKB_PPmP, BRKB_PPzP)>;
546
547// Loop control, based on predicate and flag setting
548def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
549
550// Loop control, propagating
551def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;
552
553// Loop control, propagating and flag setting
554def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>;
555def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>;
556
557
558// Loop control, based on GPR
559def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>],
560             (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
561
562def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
563
564// Loop terminate
565def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
566
567// Predicate counting scalar
568def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
569
570def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
571             (instregex "^CNT[BHWD]_XPiI")>;
572
573def : InstRW<[CortexA510Write<3, CortexA510UnitALU>],
574             (instregex "^(INC|DEC)[BHWD]_XPiI")>;
575
576def : InstRW<[CortexA510Write<4, CortexA510UnitALU>],
577             (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>;
578
579// Predicate counting scalar, active predicate
580def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
581             (instregex "^CNTP_XPP_[BHSD]")>;
582
583def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
584             (instregex "^(DEC|INC)P_XP_[BHSD]")>;
585
586def : InstRW<[CortexA510Write<9, CortexA510UnitVALU0>],
587             (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
588                        "^(UQDEC|UQINC)P_WP_[BHSD]",
589                        "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>;
590
591
592// Predicate counting vector, active predicate
593def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
594             (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
595
596// Predicate logical
597def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>],
598             (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
599
600// Predicate logical, flag setting
601def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>],
602             (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
603
604// Predicate reverse
605def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>;
606
607// Predicate select
608def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs SEL_PPPP)>;
609
610// Predicate set
611def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
612
613// Predicate set/initialize, set flags
614def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>;
615
616// Predicate find first/next
617def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
618
619// Predicate test
620def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PTEST_PP)>;
621
622// Predicate transpose
623def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>;
624
625// Predicate unpack and widen
626def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
627
628// Predicate zip/unzip
629def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>;
630
631
632// SVE integer instructions
633// -----------------------------------------------------------------------------
634// Arithmetic, absolute diff
635def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>;
636
637// Arithmetic, absolute diff accum
638def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
639
640// Arithmetic, absolute diff accum long
641def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
642
643// Arithmetic, absolute diff long
644def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
645
646// Arithmetic, basic
647def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
648             (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]",
649                        "^(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
650                        "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
651                        "^(ADD|SUB)_ZZZ_[BHSD]",
652                        "^(ADD|SUB|SUBR)_ZI_[BHSD]",
653                        "^ADR_[SU]XTW_ZZZ_D_[0123]",
654                        "^ADR_LSL_ZZZ_[SD]_[0123]",
655                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>;
656def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
657             (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
658                        "^SADDLBT_ZZZ_[HSD]",
659                        "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
660
661// Arithmetic, complex (saturating / halving / narrowing-high integer forms)
662def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
663             (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]",
664                        "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]",  // NOTE(review): '_?' makes the underscore optional, unlike sibling patterns; confirm intended
665                        "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
666                        "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
667                        "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
668                        "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
669def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>],
670             (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>;
671
672// Arithmetic, large integer
673def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
674
675// Arithmetic, pairwise add
676def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>;
677
678// Arithmetic, pairwise add and accum long
679def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
680
681// Arithmetic, shift
682def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
683             (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
684                        "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
685                        "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
686                        "^(ASR|LSL|LSR)_ZPZI_[BHSD]",
687                        "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
688                        "^(ASR|LSL|LSR)_ZPZZ_[BHSD]",
689                        "^(ASR|LSL|LSR)_ZZI_[BHSD]",
690                        "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
691// Arithmetic, shift right for divide
692def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
693             (instregex "^ASRD_ZPmI_[BHSD]",
694                        "^ASRD_ZPZI_[BHSD]")>;
695
696// Arithmetic, shift and accumulate
697def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
698             (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>;
699
700def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>],
701             (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>;
702
703
704// Arithmetic, shift by immediate
705// Arithmetic, shift by immediate and insert
706def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
707             (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>;
708
709// Arithmetic, shift complex
710def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
711             (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
712                        "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]",
713                        "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
714                        "^SQSHRU?N[BT]_ZZI_[BHS]",
715                        "^UQR?SHRN[BT]_ZZI_[BHS]")>;
716
717// Arithmetic, shift rounding
718def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
719             (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]",
720                        "^[SU]RSHR_ZPmI_[BHSD]")>;
721
722// Bit manipulation (BDEP/BEXT/BGRP): one record per element size; both numeric CortexA510MCWrite arguments grow from B to D
723def : InstRW<[CortexA510MCWrite<14, 13, CortexA510UnitVMC>],
724             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>;
725
726def : InstRW<[CortexA510MCWrite<22, 21, CortexA510UnitVMC>],
727             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>;
728
729def : InstRW<[CortexA510MCWrite<38, 37, CortexA510UnitVMC>],
730             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>;
731
732def : InstRW<[CortexA510MCWrite<70, 69, CortexA510UnitVMC>],
733             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>;
734
735
736// Bitwise select
737def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
738
739// Count/reverse bits
740def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>;
741def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>;
742def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>;
743def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>;
744// Broadcast logical bitmask immediate to vector
745def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>;
746
747// Compare and set flags
748def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
749             (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
750                        "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
751
752// Complex add
753def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>;
754
755def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>;
756
757// Complex dot product 8-bit element
758def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
759
760// Complex dot product 16-bit element
761def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
762
763// Complex multiply-add B, H, S element size
764def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]",
765                                            "^CMLA_ZZZI_[HS]")>;
766
767// Complex multiply-add D element size
768def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CMLA_ZZZ_D)>;
769
770// Conditional extract operations, scalar form
771def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
772
773// Conditional extract operations, SIMD&FP scalar and vector forms
774def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
775                                            "^COMPACT_ZPZ_[SD]",
776                                            "^SPLICE_ZPZZ?_[BHSD]")>;
777
778// Convert to floating point, 64b to float or convert to double
779def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>;
780
781// Convert to floating point, 64b to half
782def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>;
783
784// Convert to floating point, 32b to single or half
785def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
786
787// Convert to floating point, 32b to double
788def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>;
789
790// Convert to floating point, 16b to half
791def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
792
793// Copy, scalar
794def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>],(instregex "^CPY_ZPmR_[BHSD]")>;
795
796// Copy, scalar SIMD&FP or imm
797def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]",
798                                           "^CPY_ZPzI_[BHSD]")>;
799
800// Divides, 32 bit
801def : InstRW<[CortexA510MCWrite<15, 12, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>;
802
803// Divides, 64 bit
804def : InstRW<[CortexA510MCWrite<26, 23, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>;
805
806// Dot product, 8 bit
807def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>;
808
809// Dot product, 8 bit, using signed and unsigned integers
810def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
811
812// Dot product, 16 bit
813def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>;
814
815// Duplicate, immediate and indexed form
816def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZI_[BHSD]",
817                                           "^DUP_ZZI_[BHSDQ]")>;
818
819// Duplicate, scalar form
820def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZR_[BHSD]")>;
821
822// Extend, sign or zero
823def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]",
824                                            "^[SU]XTH_ZPmZ_[SD]",
825                                            "^[SU]XTW_ZPmZ_[D]")>;
826
827// Extract
828def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>;
829
830// Extract narrow saturating
831def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
832                                            "^SQXTUN[BT]_ZZ_[BHS]")>;
833
834// Extract/insert operation, SIMD and FP scalar form
835def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]",
836                                            "^INSR_ZV_[BHSD]")>;
837
838// Extract/insert operation, scalar
839def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^LAST[AB]_RPZ_[BHSD]",
840                                                "^INSR_ZR_[BHSD]")>;
841
842// Histogram operations
843def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^HISTCNT_ZPzZZ_[SD]",
844                                                  "^HISTSEG_ZZZ")>;
845
846// Horizontal operations, B, H, S form, immediate operands only
847def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_II_[BHS]")>;
848
849// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar
850// operands only / immediate, scalar operands
851def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
852
853// Horizontal operations, D form, immediate operands only
854def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs INDEX_II_D)>;
855
856// Horizontal operations, D form, scalar, immediate operands / scalar operands
857// only / immediate, scalar operands
858def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>;
859
860// Logical (AND/BIC/EOR/NOT/ORR): _ZI, _ZZZ, _ZPmZ and _ZPZZ instruction forms
861def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
862             (instregex "^(AND|EOR|ORR)_ZI",
863                        "^(AND|BIC|EOR|ORR)_ZZZ",
864                        "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]",
865                        "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>;
866
867def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
868             (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>;
869
870// Max/min, basic and pairwise
871def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
872                                           "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>;
873
874// Matching operations
875def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>;
876
877// Matrix multiply-accumulate
878def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
879
880// Move prefix
881def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
882                                           "^MOVPRFX_ZZ")>;
883
884// Multiply, B, H, S element size
885def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]",
886                                            "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>;
887
888// Multiply, D element size
889def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D",
890                                            "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>;
891
892// Multiply long
893def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
894                                            "^[SU]MULL[BT]_ZZZ_[HSD]")>;
895
896// Multiply accumulate, B, H, S element size
897def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]",
898                                            "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
899
900// Multiply accumulate, D element size
901def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D",
902                                            "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
903
904// Multiply accumulate long
905def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
906                                            "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
907
908// Multiply accumulate saturating doubling long regular
909def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]",
910                                            "^SQDML[AS](LB|LT)_ZZZI_[SD]")>;
911
912// Multiply saturating doubling high, B, H, S element size
913def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULH_ZZZ_[BHS]",
914                                            "^SQDMULH_ZZZI_[HS]")>;
915
916// Multiply saturating doubling high, D element size
917def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
918
919// Multiply saturating doubling long
920def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
921                                            "^SQDMULL[BT]_ZZZI_[SD]")>;
922
923// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
924// element size
925def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
926                                            "^SQRDCMLAH_ZZZ_[BHS]",
927                                            "^SQRDML[AS]H_ZZZI_[HS]",
928                                            "^SQRDCMLAH_ZZZI_[HS]")>;
929
930// Multiply saturating rounding doubling regular/complex accumulate, D element
931// size
932def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZI?_D",
933                                            "^SQRDCMLAH_ZZZ_D")>;
934
935// Multiply saturating rounding doubling regular/complex, B, H, S element size
936def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ_[BHS]",
937                                            "^SQRDMULH_ZZZI_[HS]")>;
938
939// Multiply saturating rounding doubling regular/complex, D element size
940def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>;
941
942// Multiply/multiply long, (8x8) polynomial
943def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>;
944
945def : InstRW<[CortexA510Write<9, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>;
946
947
948// Predicate counting vector
949def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
950             (instregex "^(DEC|INC)[HWD]_ZPiI")>;
951def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
952             (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>;
953
954// Reciprocal estimate
955def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
956
957// Reduction, arithmetic, B form
958def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
959
960// Reduction, arithmetic, H form
961def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
962
963// Reduction, arithmetic, S form
964def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
965
966// Reduction, arithmetic, D form
967def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
968
969// Reduction, logical
970def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>;
971
972// Reverse, vector
973def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]",
974                                           "^REVB_ZPmZ_[HSD]",
975                                           "^REVH_ZPmZ_[SD]",
976                                           "^REVW_ZPmZ_D")>;
977
978// Select, vector form
979def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>;
980
981// Table lookup
982def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[BHSD]")>;
983
984// Table lookup extension
985def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>;
986
987// Transpose, vector form
988def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
989
990// Unpack and extend
991def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
992
993// Zip/unzip
994def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
995
996// SVE floating-point instructions
997// -----------------------------------------------------------------------------
998
999// Floating point absolute value/difference
1000def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FAB[SD]_ZPmZ_[HSD]",
1001                                                                  "^FAB[SD]_ZPZZ_[HSD]")>;
1002
1003// Floating point arithmetic
1004def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]",
1005                                           "^FADDP_ZPmZZ_[HSD]",
1006                                           "^FNEG_ZPmZ_[HSD]",
1007                                           "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>;
1008
1009// Floating point associative add, F16
1010def : InstRW<[CortexA510MCWrite<32, 29, CortexA510UnitVALU>], (instrs FADDA_VPZ_H)>;
1011
1012// Floating point associative add, F32
1013def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_S)>;
1014
1015// Floating point associative add, F64
1016def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;
1017
1018// Floating point compare
1019def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
1020                                            "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
1021                                            "^FCM(LE|LT)_PPzZ0_[HSD]",
1022                                            "^FCMUO_PPzZZ_[HSD]")>;
1023
1024// Floating point complex add
1025def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>;
1026
1027// Floating point complex multiply add
1028def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FCMLA_ZPmZZ_[HSD]",
1029                                           "^FCMLA_ZZZI_[HS]")>;
1030
1031// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
1032def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
1033                                            "^FCVTLT_ZPmZ_HtoS",
1034                                            "^FCVTNT_ZPmZ_StoH")>;
1035
1036// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
1037// or F64 to F16)
1038def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
1039                                            "^FCVTLT_ZPmZ_StoD",
1040                                            "^FCVTNT_ZPmZ_DtoS")>;
1041
1042// Floating point convert, round to odd (FCVTX and FCVTXNT, DtoS forms)
1043def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "^FCVTXNT_ZPmZ_DtoS")>;
1044
1045// Floating point base2 log, F16
1046def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
1047
1048// Floating point base2 log, F32
1049def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
1050
1051// Floating point base2 log, F64
1052def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
1053
1054// Floating point convert to integer, F16
1055def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
1056
1057// Floating point convert to integer, F32
1058def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
1059
1060// Floating point convert to integer, F64
1061def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
1062             (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
1063
1064// Floating point copy
1065def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>], (instregex "^FCPY_ZPmI_[HSD]",
1066                                           "^FDUP_ZI_[HSD]")>;
1067
1068// Floating point divide, F16
1069def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
1070
1071// Floating point divide, F32
1072def : InstRW<[CortexA510MCWrite<13, 10, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
1073
1074// Floating point divide, F64
1075def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
1076
1077// Floating point min/max pairwise
1078def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
1079
1080// Floating point min/max
1081def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>;
1082
1083// Floating point multiply
1084def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]",
1085                                           "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>;
1086
1087// Floating point multiply accumulate
1088def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
1089             (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]",
1090                        "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>;
1091
1092// Floating point multiply add/sub accumulate long
1093def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
1094
1095// Floating point reciprocal estimate, F16
1096def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H",
1097                                         "^FRSQRTE_ZZ_H")>;
1098
1099// Floating point reciprocal estimate, F32
1100def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S",
1101                                         "^FRSQRTE_ZZ_S")>;
1102// Floating point reciprocal estimate, F64
1103def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],(instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D",
1104                                         "^FRSQRTE_ZZ_D")>;
1105
1106// Floating point reciprocal step
1107def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
1108
1109// Floating point reduction, min/max (FMAXNMV/FMAXV/FMINNMV/FMINV), F16, F32 and F64
1110def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
1111             (instregex "^(FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_[HSD]")>;
1112
1113// Floating point reduction, add (FADDV): separate records for the _H, _S and _D forms follow
1114def : InstRW<[CortexA510MCWrite<12, 11, CortexA510UnitVALU0>],
1115             (instregex "^FADDV_VPZ_H")>;
1116
1117def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU0>],
1118             (instregex "^FADDV_VPZ_S")>;
1119
1120def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
1121             (instregex "^FADDV_VPZ_D")>;
1122
1123
1124// Floating point round to integral, F16
1125def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
1126
1127// Floating point round to integral, F32
1128def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
1129
1130// Floating point round to integral, F64
1131def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
1132
1133// Floating point square root, F16
1134def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_H")>;
1135
1136// Floating point square root, F32
1137def : InstRW<[CortexA510MCWrite<12, 9, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_S")>;
1138
1139// Floating point square root, F64
1140def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_D")>;
1141
1142// Floating point trigonometric exponentiation
1143def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>;
1144
1145// Floating point trigonometric multiply add
1146def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>;
1147
1148// Floating point trigonometric, miscellaneous
1149def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>;
1150def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>;
1151
1152
1153// SVE BFloat16 (BF16) instructions
1154// -----------------------------------------------------------------------------
1155
1156// Convert, F32 to BF16
1157def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
1158
1159// Dot product
1160def : InstRW<[A510Write_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
1161
1162// Matrix multiply accumulate
1163def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>;
1164
1165// Multiply accumulate long
1166def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>;
1167
1168// SVE Load instructions
1169// -----------------------------------------------------------------------------
1170
1171// Load vector
1172def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instrs LDR_ZXI)>;
1173
1174// Load predicate
1175def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instrs LDR_PXI)>;
1176
1177// Contiguous load, scalar + imm
1178def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]_IMM$",
1179                                           "^LD1S?B_[HSD]_IMM$",
1180                                           "^LD1S?H_[SD]_IMM$",
1181                                           "^LD1S?W_D_IMM$" )>;
1182// Contiguous load, scalar + scalar
1183def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]$",
1184                                             "^LD1S?B_[HSD]$",
1185                                             "^LD1S?H_[SD]$",
1186                                             "^LD1S?W_D$" )>;
1187
1188// Contiguous load broadcast, scalar + imm
1189def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1R[BHWD]_IMM$",
1190                                           "^LD1RSW_IMM$",
1191                                           "^LD1RS?B_[HSD]_IMM$",
1192                                           "^LD1RS?H_[SD]_IMM$",
1193                                           "^LD1RS?W_D_IMM$",
1194                                           "^LD1RQ_[BHWD]_IMM$")>;
1195
1196// Contiguous load broadcast, scalar + scalar
1197def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LD1RQ_[BHWD]$")>;
1198
1199// Non temporal load, scalar + imm
1200def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRI$")>;
1201
1202// Non temporal load, scalar + scalar
1203def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRR$")>;
1204
1205// Non temporal gather load, vector + scalar 32-bit element size
1206def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^LDNT1[BHW]_ZZR_S$",
1207                                              "^LDNT1S[BH]_ZZR_S$")>;
1208
1209// Non temporal gather load, vector + scalar 64-bit element size
1210def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
1211def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instrs LDNT1D_ZZR_D)>;
1212
// Contiguous first faulting load, scalar + scalar
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LDFF1[BHWD]$",
                        "^LDFF1S?B_[HSD]$",
                        "^LDFF1S?H_[SD]$",
                        "^LDFF1S?W_D$")>;

// Contiguous non faulting load, scalar + imm
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LDNF1[BHWD]_IMM$",
                        "^LDNF1S?B_[HSD]_IMM$",
                        "^LDNF1S?H_[SD]_IMM$",
                        "^LDNF1S?W_D_IMM$")>;
1224
// Contiguous load of two structures to two vectors.
//   scalar + imm
def : InstRW<[CortexA510MCWrite<3, 1, CortexA510UnitLdSt>],
             (instregex "^LD2[BHWD]_IMM$")>;
//   scalar + scalar (second template argument is 2 here, 1 for the imm form)
def : InstRW<[CortexA510MCWrite<3, 2, CortexA510UnitLdSt>],
             (instregex "^LD2[BHWD]$")>;

// Contiguous load of three structures to three vectors.
//   scalar + imm
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>],
             (instregex "^LD3[BHWD]_IMM$")>;
//   scalar + scalar
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>],
             (instregex "^LD3[BHWD]$")>;

// Contiguous load of four structures to four vectors.
//   scalar + imm
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>],
             (instregex "^LD4[BHWD]_IMM$")>;
//   scalar + scalar
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>],
             (instregex "^LD4[BHWD]$")>;
1242
// Gather load, vector + imm.
//   32-bit element size
def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>],
             (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
                        "^GLD(FF)?1W_IMM$")>;

//   64-bit element size
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>],
             (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
                        "^GLD(FF)?1D_IMM$")>;
1250
// Gather load, 64-bit element size
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>],
             (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW(_SCALED)?$",
                        "^GLD(FF)?1S?[BHW]_D(_SCALED)?$",
                        "^GLD(FF)?1D_[SU]XTW(_SCALED)?$",
                        "^GLD(FF)?1D(_SCALED)?$")>;

// NOTE(review): the two entries below name CortexA510UnitLd, whereas every
// other gather entry in this group names CortexA510UnitLdSt -- confirm that
// this is intended.

// Gather load, 32-bit scaled offset
// NOTE(review): the second pattern has no trailing '$', unlike its
// neighbours -- confirm that the open-ended match is intended.
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>],
             (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$",
                        "^GLD(FF)?1W_[SU]XTW_SCALED")>;

// Gather load, 32-bit unpacked unscaled offset
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>],
             (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
                        "^GLD(FF)?1W_[SU]XTW$")>;
1266
// Opcodes whose names start with PRFB/PRFH/PRFW/PRFD; the trailing ".*"
// accepts any suffix. The write is CortexA510Write<0, ...> on the VALU unit.
def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>],
             (instregex "^PRF(B|H|W|D).*")>;
1268// SVE Store instructions
1269// -----------------------------------------------------------------------------
1270
// Store from a predicate register
def : InstRW<[CortexA510VSt0],
             (instrs STR_PXI)>;

// Store from a vector register
def : InstRW<[CortexA510VSt0],
             (instrs STR_ZXI)>;
1276
// Contiguous store.
//   scalar + imm
def : InstRW<[CortexA510VSt0],
             (instregex "^ST1[BHWD]_IMM$",
                        "^ST1B_[HSD]_IMM$",
                        "^ST1H_[SD]_IMM$",
                        "^ST1W_D_IMM$")>;

//   scalar + scalar; the ST1H forms have their own entry, with the same
//   write as the other element sizes.
def : InstRW<[CortexA510VSt0],
             (instregex "^ST1H(_[SD])?$")>;
def : InstRW<[CortexA510VSt0],
             (instregex "^ST1[BWD]$",
                        "^ST1B_[HSD]$",
                        "^ST1W_D$")>;
1288
// Contiguous store of two structures from two vectors.
//   scalar + imm
def : InstRW<[CortexA510VSt<11>],
             (instregex "^ST2[BHWD]_IMM$")>;

//   scalar + scalar; ST2H is listed by name, the other element sizes by
//   pattern, both with the same write.
def : InstRW<[CortexA510VSt<11>],
             (instrs ST2H)>;
def : InstRW<[CortexA510VSt<11>],
             (instregex "^ST2[BWD]$")>;
1297
// Contiguous store of three structures from three vectors.
// The D-element forms use CortexA510VSt<14>; B/H/W use CortexA510VSt<25>.
//   scalar + imm
def : InstRW<[CortexA510VSt<25>],
             (instregex "^ST3[BHW]_IMM$")>;
def : InstRW<[CortexA510VSt<14>],
             (instregex "^ST3D_IMM$")>;

//   scalar + scalar
def : InstRW<[CortexA510VSt<25>],
             (instregex "^ST3[BHW]$")>;
def : InstRW<[CortexA510VSt<14>],
             (instregex "^ST3D$")>;
1305
// Contiguous store of four structures from four vectors.
// The D-element forms use CortexA510VSt<25>; B/H/W use CortexA510VSt<50>.
//   scalar + imm
def : InstRW<[CortexA510VSt<50>],
             (instregex "^ST4[BHW]_IMM$")>;
def : InstRW<[CortexA510VSt<25>],
             (instregex "^ST4D_IMM$")>;

//   scalar + scalar
def : InstRW<[CortexA510VSt<50>],
             (instregex "^ST4[BHW]$")>;
def : InstRW<[CortexA510VSt<25>],
             (instregex "^ST4D$")>;
1315
// Non temporal store.
//   scalar + imm
def : InstRW<[CortexA510VSt0],
             (instregex "^STNT1[BHWD]_ZRI$")>;

//   scalar + scalar; STNT1H_ZRR is listed by name, the other element sizes
//   by pattern, both with the same write.
def : InstRW<[CortexA510VSt0],
             (instrs STNT1H_ZRR)>;
def : InstRW<[CortexA510VSt0],
             (instregex "^STNT1[BWD]_ZRR$")>;
1322
// Scatter non temporal store, vector + scalar.
// NOTE(review): neither pattern below ends in '$', unlike most patterns in
// this file -- confirm that the open-ended match is intended.
//   32-bit element size
def : InstRW<[CortexA510VSt<9>],
             (instregex "^STNT1[BHW]_ZZR_S")>;

//   64-bit element size
def : InstRW<[CortexA510VSt<7>],
             (instregex "^STNT1[BHWD]_ZZR_D")>;
1328
// Scatter store, vector + imm.
//   32-bit element size
def : InstRW<[CortexA510VSt<9>],
             (instregex "^SST1[BH]_S_IMM$",
                        "^SST1W_IMM$")>;

//   64-bit element size
def : InstRW<[CortexA510VSt<7>],
             (instregex "^SST1[BHW]_D_IMM$",
                        "^SST1D_IMM$")>;
1336
// Scatter stores with a register offset. Every entry in this group uses
// CortexA510VSt<8>; they differ only in the opcodes they select.

// Scatter store, 32-bit scaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;

// Scatter store, 32-bit unpacked unscaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[BHW]_D_[SU]XTW$",
                        "^SST1D_[SU]XTW$")>;

// Scatter store, 32-bit unpacked scaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
                        "^SST1D_[SU]XTW_SCALED$")>;

// Scatter store, 32-bit unscaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[BH]_S_[SU]XTW$",
                        "^SST1W_[SU]XTW$")>;

// Scatter store, 64-bit scaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[HW]_D_SCALED$",
                        "^SST1D_SCALED$")>;

// Scatter store, 64-bit unscaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[BHW]_D$",
                        "^SST1D$")>;
1360
1361// SVE Miscellaneous instructions
1362// -----------------------------------------------------------------------------
1363
// Read first fault register, unpredicated
def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
             (instrs RDFFR_P)>;

// Read first fault register, predicated
def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>],
             (instrs RDFFR_PPz)>;

// Read first fault register and set flags
def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>],
             (instrs RDFFRS_PPz)>;

// Set first fault register (SETFFR) and write to first fault register
// (WRFFR) share one entry.
def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
             (instrs SETFFR, WRFFR)>;
1376
1377// SVE Cryptographic instructions
1378// -----------------------------------------------------------------------------
1379
// Crypto AES ops
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
             (instregex "^AES[DE]_ZZZ_B$",
                        "^AESI?MC_ZZ_B$")>;

// Crypto SHA3 ops
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^(BCAX|EOR3)_ZZZZ$",
                        "^XAR_ZZZI_[BHSD]$")>;

// RAX1 has its own entry: CortexA510MC_RC0Write on the VMC unit.
def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>],
             (instregex "^RAX1_ZZZ_D$")>;

// Crypto SM4 ops
def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>],
             (instregex "^SM4E(KEY)?_ZZZ_S$")>;
1392
1393}
1394