1//==- AArch64SchedCortexA510.td - ARM Cortex-A510 Scheduling Definitions -*- tablegen -*-=// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file defines the machine model for the ARM Cortex-A510 processor. 10// 11//===----------------------------------------------------------------------===// 12 13// ===---------------------------------------------------------------------===// 14// The following definitions describe the per-operand machine model. 15// This works with MachineScheduler. See MCSchedModel.h for details. 16 17// Cortex-A510 machine model for scheduling and other instruction cost heuristics. 18def CortexA510Model : SchedMachineModel { 19 let MicroOpBufferSize = 0; // The Cortex-A510 is an in-order processor 20 let IssueWidth = 3; // It dual-issues under most circumstances 21 let LoadLatency = 3; // Cycles for loads to access the cache. 22 // Most loads have a latency of 2, but some have higher latencies. 23 // 3 seems to be a good tradeoff 24 let PostRAScheduler = 1; // Enable PostRA scheduler pass. 25 let CompleteModel = 0; // Covers instructions applicable to Cortex-A510. 26 27 // FIXME: Remove when all errors have been fixed. 28 let FullInstRWOverlapCheck = 0; 29} 30 31 32//===----------------------------------------------------------------------===// 33// Subtarget-specific SchedWrite types 34 35let SchedModel = CortexA510Model in { 36 37//===----------------------------------------------------------------------===// 38// Define each kind of processor resource and number available. 39 40// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the 41// Cortex-A510 is in-order. 42let BufferSize = 0 in { 43 def CortexA510UnitALU0 : ProcResource<1>; // Int ALU0 44 def CortexA510UnitALU12 : ProcResource<2>; // Int ALU1 & ALU2 45 def CortexA510UnitMAC : ProcResource<1>; // Int MAC, 64-bi wide 46 def CortexA510UnitDiv : ProcResource<1>; // Int Division, not pipelined 47 // There are 2 LS pipes, 1 for Load/Store; 1 for Store only 48 def CortexA510UnitLdSt : ProcResource<1>; // Load/Store shared pipe 49 def CortexA510UnitLd1 : ProcResource<1>; // Load pipe 50 def CortexA510UnitB : ProcResource<1>; // Branch 51 def CortexA510UnitPAC : ProcResource<1>; // Pointer Authentication (PAC) pipe 52 53 // The FP DIV/SQRT instructions execute totally differently from the FP ALU 54 // instructions, which can mostly be dual-issued; that's why for now we model 55 // them with 2 resources. 56 def CortexA510UnitVALU0 : ProcResource<1>; // SIMD/FP/SVE ALU0 57 def CortexA510UnitVALU1 : ProcResource<1>; // SIMD/FP/SVE ALU0 58 def CortexA510UnitVMAC : ProcResource<2>; // SIMD/FP/SVE MAC 59 def CortexA510UnitVMC : ProcResource<1>; // SIMD/FP/SVE multicycle instrs (e.g Div, SQRT, cryptography) 60} 61 62def CortexA510UnitLd : ProcResGroup<[CortexA510UnitLdSt, CortexA510UnitLd1]>; 63def CortexA510UnitVALU : ProcResGroup<[CortexA510UnitVALU0, CortexA510UnitVALU1]>; 64def CortexA510UnitALU : ProcResGroup<[CortexA510UnitALU0, CortexA510UnitALU12]>; 65// These latencies are modeled without taking into account forwarding paths 66// (the software optimisation guide lists latencies taking into account 67// typical forwarding paths). 68def : WriteRes<WriteImm, [CortexA510UnitALU]> { let Latency = 1; } // MOVN, MOVZ 69def : WriteRes<WriteI, [CortexA510UnitALU]> { let Latency = 1; } // ALU 70def : WriteRes<WriteISReg, [CortexA510UnitALU]> { let Latency = 2; } // ALU of Shifted-Reg 71def : WriteRes<WriteIEReg, [CortexA510UnitALU]> { let Latency = 2; } // ALU of Extended-Reg 72def : WriteRes<WriteExtr, [CortexA510UnitALU]> { let Latency = 2; } // EXTR from a reg pair 73def : WriteRes<WriteIS, [CortexA510UnitALU]> { let Latency = 2; } // Shift/Scale 74 75// MAC 76def : WriteRes<WriteIM32, [CortexA510UnitMAC]> { let Latency = 3; } // 32-bit Multiply 77def : WriteRes<WriteIM64, [CortexA510UnitMAC]> { let Latency = 5; let ReleaseAtCycles = [2];} // 64-bit Multiply 78 79// Div 80def : WriteRes<WriteID32, [CortexA510UnitDiv]> { 81 let Latency = 8; let ReleaseAtCycles = [8]; 82} 83def : WriteRes<WriteID64, [CortexA510UnitDiv]> { 84 let Latency = 16; let ReleaseAtCycles = [16]; 85} 86 87//===----------------------------------------------------------------------===// 88// Define customized scheduler read/write types specific to the Cortex A510 89 90//===----------------------------------------------------------------------===// 91class CortexA510Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> { 92 let Latency = n; 93} 94 95class CortexA510MCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> { 96 let Latency = n; 97 let ReleaseAtCycles = [m]; 98 let BeginGroup = 1; 99} 100 101class CortexA510MC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> { 102 let Latency = n; 103 let BeginGroup = 1; 104} 105 106//===----------------------------------------------------------------------===// 107// Define generic 2 micro-op types 108def A510Write_10cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> { 109 let Latency = 10; 110 let NumMicroOps = 2; 111} 112 113def A510Write_15cyc_1VMAC_1VALU : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> { 114 let Latency = 15; 115 let NumMicroOps = 2; 116} 117 118class A510Write_PAC_B <int lat> : SchedWriteRes<[CortexA510UnitPAC, CortexA510UnitB]> { 119 let Latency = lat; 120 let NumMicroOps = 2; 121} 122// Load 123def : WriteRes<WriteLD, [CortexA510UnitLd]> { let Latency = 2; } 124def : WriteRes<WriteLDIdx, [CortexA510UnitLd]> { let Latency = 2; } 125def : WriteRes<WriteLDHi, [CortexA510UnitLd]> { let Latency = 2; } 126 127def CortexA510WriteVLD1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; } 128def CortexA510WriteVLD1SI : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; let SingleIssue = 1; } 129def CortexA510WriteVLD2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 4; 130 let ReleaseAtCycles = [2]; } 131def CortexA510WriteVLD3 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5; 132 let ReleaseAtCycles = [3]; } 133def CortexA510WriteVLD4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6; 134 let ReleaseAtCycles = [4]; } 135def CortexA510WriteVLD6 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 5; 136 let ReleaseAtCycles = [3]; } 137def CortexA510WriteVLD8 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 6; 138 let ReleaseAtCycles = [4]; } 139 140def CortexA510WriteLDP1 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; } 141def CortexA510WriteLDP2 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; } 142def CortexA510WriteLDP4 : SchedWriteRes<[CortexA510UnitLd]> { let Latency = 3; } 143 144// Pre/Post Indexing - Performed as part of address generation 145def : WriteRes<WriteAdr, []> { let Latency = 0; } 146 147// Store 148let RetireOOO = 1 in { 149def : WriteRes<WriteST, [CortexA510UnitLdSt]> { let Latency = 1; } 150def : WriteRes<WriteSTP, [CortexA510UnitLdSt]> { let Latency = 1; } 151def : WriteRes<WriteSTIdx, [CortexA510UnitLdSt]> { let Latency = 1; } 152} 153def : WriteRes<WriteSTX, [CortexA510UnitLdSt]> { let Latency = 3; } 154 155// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. 156def : WriteRes<WriteVST, [CortexA510UnitLdSt]> { let Latency = 5; 157 let ReleaseAtCycles = [2];} 158def CortexA510WriteVST1 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 4; } 159def CortexA510WriteVST2 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5; 160 let ReleaseAtCycles = [2]; } 161def CortexA510WriteVST3 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5; 162 let ReleaseAtCycles = [3]; } 163def CortexA510WriteVST4 : SchedWriteRes<[CortexA510UnitLdSt]> { let Latency = 5; 164 let ReleaseAtCycles = [4]; } 165 166def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } 167 168// Branch 169def : WriteRes<WriteBr, [CortexA510UnitB]>; 170def : WriteRes<WriteBrReg, [CortexA510UnitB]>; 171def : WriteRes<WriteSys, [CortexA510UnitB]>; 172def : WriteRes<WriteBarrier, [CortexA510UnitB]>; 173def : WriteRes<WriteHint, [CortexA510UnitB]>; 174 175// FP ALU 176// As WriteF result is produced in F5 and it can be mostly forwarded 177// to consumer at F1, the effectively Latency is set as 4. 178def : WriteRes<WriteF, [CortexA510UnitVALU]> { let Latency = 4; } 179def : WriteRes<WriteFCmp, [CortexA510UnitVALU]> { let Latency = 3; } 180def : WriteRes<WriteFCvt, [CortexA510UnitVALU]> { let Latency = 4; } 181def : WriteRes<WriteFCopy, [CortexA510UnitVALU]> { let Latency = 3; } 182def : WriteRes<WriteFImm, [CortexA510UnitVALU]> { let Latency = 3; } 183 184class CortexA510VSt<int n> : SchedWriteRes<[CortexA510UnitLdSt]> { 185 let RetireOOO = 1; 186 let ReleaseAtCycles = [n]; 187} 188 189def CortexA510VSt0 : SchedWriteRes<[CortexA510UnitLdSt]> { 190 let RetireOOO = 1; 191} 192 193def : SchedAlias<WriteVd, CortexA510Write<4, CortexA510UnitVALU>>; 194def : SchedAlias<WriteVq, CortexA510Write<4, CortexA510UnitVALU>>; 195 196// FP ALU specific new schedwrite definitions 197def CortexA510WriteFPALU_F3 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 3;} 198def CortexA510WriteFPALU_F4 : SchedWriteRes<[CortexA510UnitVALU]> { let Latency = 4;} 199 200// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined 201def : WriteRes<WriteFMul, [CortexA510UnitVMAC]> { let Latency = 4; } 202 203let RetireOOO = 1 in { 204def : WriteRes<WriteFDiv, [CortexA510UnitVMC]> { let Latency = 22; 205 let ReleaseAtCycles = [29]; } 206def CortexA510WriteVMAC : SchedWriteRes<[CortexA510UnitVMAC]> { let Latency = 4; } 207def CortexA510WriteFDivHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8; 208 let ReleaseAtCycles = [5]; } 209def CortexA510WriteFDivSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 13; 210 let ReleaseAtCycles = [10]; } 211def CortexA510WriteFDivDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22; 212 let ReleaseAtCycles = [19]; } 213def CortexA510WriteFSqrtHP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 8; 214 let ReleaseAtCycles = [5]; } 215def CortexA510WriteFSqrtSP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 12; 216 let ReleaseAtCycles = [9]; } 217def CortexA510WriteFSqrtDP : SchedWriteRes<[CortexA510UnitVMC]> { let Latency = 22; 218 let ReleaseAtCycles = [19]; } 219} 220//===----------------------------------------------------------------------===// 221// Subtarget-specific SchedRead types. 222 223def : ReadAdvance<ReadVLD, 0>; 224def : ReadAdvance<ReadExtrHi, 0>; 225def : ReadAdvance<ReadAdrBase, 0>; 226def : ReadAdvance<ReadST, 1>; 227 228def : ReadAdvance<ReadI, 0>; 229def : ReadAdvance<ReadISReg, 0>; 230def : ReadAdvance<ReadIEReg, 0>; 231 232 233// MUL 234def : ReadAdvance<ReadIM, 0>; 235def : ReadAdvance<ReadIMA, 2>; 236 237// Div 238def : ReadAdvance<ReadID, 0>; 239 240//===----------------------------------------------------------------------===// 241// Subtarget-specific InstRWs. 242 243def A510WriteISReg : SchedWriteVariant<[ 244 SchedVar<RegShiftedPred, [WriteISReg]>, 245 SchedVar<NoSchedPred, [WriteI]>]>; 246def : InstRW<[A510WriteISReg], (instregex ".*rs$")>; 247def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>; 248 249// Pointer Authentication Instructions (v8.3 PAC) 250// ----------------------------------------------------------------------------- 251 252// Authenticate data address 253// Authenticate instruction address 254// Compute pointer authentication code for data address 255// Compute pointer authentication code, using generic key 256// Compute pointer authentication code for instruction address 257def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; 258 259// Branch and link, register, with pointer authentication 260// Branch, register, with pointer authentication 261// Branch, return, with pointer authentication 262def : InstRW<[A510Write_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA, 263 BRAAZ, BRAB, BRABZ, RETAA, RETAB, 264 ERETAA, ERETAB)>; 265 266// Load register, with pointer authentication 267def : InstRW<[CortexA510Write<2, CortexA510UnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>; 268 269// Strip pointer authentication code 270def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instrs XPACD, XPACI, XPACLRI)>; 271//--- 272// Miscellaneous 273//--- 274def : InstRW<[CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?Wi")>; 275def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPSi")>; 276def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)i")>; 277def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQi")>; 278def : InstRW<[WriteAdr, CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?W(pre|post)")>; 279def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPS(pre|post)")>; 280def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)(pre|post)")>; 281def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQ(pre|post)")>; 282def : InstRW<[WriteI], (instrs COPY)>; 283//--- 284// Vector Loads - 128-bit per cycle 285//--- 286// 1-element structures 287def : InstRW<[CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; // single element 288def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate 289def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>; 290def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(16b|8h|4s|2d)$")>; 291def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures 292def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(16b|8h|4s|2d)$")>; 293def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)$")>; 294def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>; 295def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; 296def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; 297 298def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>; 299def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 300def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; 301def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; 302def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; 303def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; 304def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; 305def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; 306def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; 307def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; 308 309// 2-element structures 310def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>; 311def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 312def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; 313def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; 314 315def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)(_POST)?$")>; 316def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; 317def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; 318def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; 319 320// 3-element structures 321def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; 322def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 323def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>; 324def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>; 325 326def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>; 327def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 328def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; 329def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; 330 331// 4-element structures 332def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; // load single 4-el structure to one lane of 4 regs. 333def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs. 334def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>; // load multiple 4-el structures to 4 regs. 335def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; 336 337def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>; 338def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 339def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>; 340def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; 341 342//--- 343// Vector Stores 344//--- 345def : InstRW<[CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)$")>; 346def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 347def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 348def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 349def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 350def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>; 351def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 352def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 353def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 354def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 355 356def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>; 357def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>; 358def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>; 359def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>; 360def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>; 361def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; 362 363def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>; 364def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 365def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>; 366def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>; 367 368def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>; 369def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; 370def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>; 371def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; 372 373//--- 374// Floating Point Conversions, MAC, DIV, SQRT 375//--- 376def : InstRW<[CortexA510WriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>; 377def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^XTN")>; 378def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>; 379def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>; 380 381def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>; 382def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>; 383def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTFv")>; 384 385def : InstRW<[CortexA510WriteVMAC], (instregex "^FN?M(ADD|SUB).*")>; 386def : InstRW<[CortexA510WriteVMAC], (instregex "^FML(A|S)v.*")>; 387def : InstRW<[CortexA510WriteFDivHP], (instrs FDIVHrr)>; 388def : InstRW<[CortexA510WriteFDivSP], (instrs FDIVSrr)>; 389def : InstRW<[CortexA510WriteFDivDP], (instrs FDIVDrr)>; 390def : InstRW<[CortexA510WriteFDivHP], (instregex "^FDIVv.*16$")>; 391def : InstRW<[CortexA510WriteFDivSP], (instregex "^FDIVv.*32$")>; 392def : InstRW<[CortexA510WriteFDivDP], (instregex "^FDIVv.*64$")>; 393def : InstRW<[CortexA510WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; 394def : InstRW<[CortexA510WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; 395def : InstRW<[CortexA510WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; 396 397def : InstRW<[CortexA510WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr)>; 398 399// 4.15. Advanced SIMD integer instructions 400// ASIMD absolute diff 401def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; 402def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; 403// ASIMD absolute diff accum 404def : InstRW<[CortexA510Write<6, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; 405// ASIMD absolute diff long 406def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>; 407// ASIMD arith #1 408def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v", 409 "[SU]R?HADDv", "[SU]HSUBv")>; 410// ASIMD arith #2 411def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$", 412 "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$", 413 "ADDPv(2i32|4i16|8i8)$")>; 414def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>; 415def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", 416 "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$", 417 "ADDPv(16i8|2i64|4i32|8i16)$")>; 418def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>; 419// ASIMD arith #3 420def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", 421 "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>; 422def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDHNv", "SUBHNv")>; 423// ASIMD arith #5 424def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; 425// ASIMD arith, reduce 426def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ADDVv")>; 427def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLVv", "UADDLVv")>; 428// ASIMD compare #1 429def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; 430def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; 431// ASIMD compare #2 432def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>; 433def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>; 434// ASIMD logical $1 435def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8", 436 "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>; 437def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8", 438 "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>; 439// ASIMD max/min, basic 440def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; 441def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; 442// SIMD max/min, reduce 443def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>; 444// ASIMD multiply, by element 445def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$", 446 "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>; 447// ASIMD multiply 448def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv8i8)>; 449def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv16i8)>; 450// ASIMD multiply accumulate 451def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>; 452def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>; 453def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>; 454// ASIMD multiply accumulate half 455def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQRDML[AS]H[vi]")>; 456// ASIMD multiply accumulate long 457def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]ML[AS]Lv")>; 458// ASIMD multiply accumulate long #2 459def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQDML[AS]L[iv]")>; 460// ASIMD dot product 461def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv8i8")>; 462def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv16i8")>; 463// ASIMD dot product, by scalar 464def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTlanev")>; 465// ASIMD multiply long 466def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>; 467// ASIMD polynomial (8x8) multiply long 468def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>; 469// ASIMD pairwise add and accumulate 470def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>; 471// ASIMD shift accumulate 472def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; 473def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; 474// ASIMD shift accumulate #2 475def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; 476// ASIMD shift by immed 477def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv", 478 "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; 479// ASIMD shift by immed 480// SXTL and UXTL are aliases for SHLL 481def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[US]?SHLLv")>; 482// ASIMD shift by immed #2 483def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)", 484 "[SU]RSHRv(16i8|2i64|4i32|8i16)")>; 485def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)", 486 "RSHRNv(16i8|4i32|8i16)")>; 487// ASIMD shift by register 488def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>; 489def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>; 490// ASIMD shift by register #2 491def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>; 492def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>; 493 494def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>; 495def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>; 496 497def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>; 498def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>; 499 500// Cryptography extensions 501// ----------------------------------------------------------------------------- 502 503// Crypto AES ops 504def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; 505 506// Crypto polynomial (64x64) multiply long 507def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; 508 509// Crypto SHA1 hash acceleration op 510// Crypto SHA1 schedule acceleration ops 511def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|SU1)")>; 512 513// Crypto SHA1 hash acceleration ops 514// Crypto SHA256 hash acceleration ops 515def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; 516 517// Crypto SHA256 schedule acceleration ops 518def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; 519 520// Crypto SHA512 hash acceleration ops 521def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; 522 523// Crypto SHA3 ops 524def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3)>; 525def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs XAR)>; 526def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs RAX1)>; 527 528 529// Crypto SM3 ops 530def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", 531 "^SM3TT[12][AB]$")>; 532 533// Crypto SM4 ops 534def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; 535 536// CRC 537// ----------------------------------------------------------------------------- 538 539def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")>; 540 541// SVE Predicate instructions 542 543// Loop control, based on predicate 544def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, 545 BRKB_PPmP, BRKB_PPzP)>; 546 547// Loop control, based on predicate and flag setting 548def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; 549 550// Loop control, propagating 551def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; 552 553// Loop control, propagating and flag setting 554def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; 555def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; 556 557 558// Loop control, based on GPR 559def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], 560 (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; 561 562def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; 563 564// Loop terminate 565def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; 566 567// Predicate counting scalar 568def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; 569 570def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], 571 (instregex "^CNT[BHWD]_XPiI")>; 572 573def : InstRW<[CortexA510Write<3, CortexA510UnitALU>], 574 (instregex "^(INC|DEC)[BHWD]_XPiI")>; 575 576def : InstRW<[CortexA510Write<4, CortexA510UnitALU>], 577 (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>; 578 579// Predicate counting scalar, active predicate 580def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], 581 (instregex "^CNTP_XPP_[BHSD]")>; 582 583def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], 584 (instregex "^(DEC|INC)P_XP_[BHSD]")>; 585 586def : InstRW<[CortexA510Write<9, CortexA510UnitVALU0>], 587 (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", 588 "^(UQDEC|UQINC)P_WP_[BHSD]", 589 "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>; 590 591 592// Predicate counting vector, active predicate 593def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 594 (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; 595 596// Predicate logical 597def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], 598 (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; 599 600// Predicate logical, flag setting 601def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], 602 (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; 603 604// Predicate reverse 605def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; 606 607// Predicate select 608def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; 609 610// Predicate set 611def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; 612 613// Predicate set/initialize, set flags 614def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; 615 616// Predicate find first/next 617def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; 618 619// Predicate test 620def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PTEST_PP)>; 621 622// Predicate transpose 623def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; 624 625// Predicate unpack and widen 626def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; 627 628// Predicate zip/unzip 629def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; 630 631 632// SVE integer instructions 633// ----------------------------------------------------------------------------- 634// Arithmetic, absolute diff 635def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>; 636 637// Arithmetic, absolute diff accum 638def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; 639 640// Arithmetic, absolute diff accum long 641def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; 642 643// Arithmetic, absolute diff long 644def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; 645 646// Arithmetic, basic 647def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 648 (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]", 649 "^(ADD|SUB|SUBR)_ZPmZ_[BHSD]", 650 "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]", 651 "^(ADD|SUB)_ZZZ_[BHSD]", 652 "^(ADD|SUB|SUBR)_ZI_[BHSD]", 653 "^ADR_[SU]XTW_ZZZ_D_[0123]", 654 "^ADR_LSL_ZZZ_[SD]_[0123]", 655 "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>; 656def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 657 (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", 658 "^SADDLBT_ZZZ_[HSD]", 659 "^SSUBL(BT|TB)_ZZZ_[HSD]")>; 660 661// Arithmetic, complex 662def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 663 (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]", 664 "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]", 665 "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", 666 "^[SU]Q(ADD|SUB)_ZI_[BHSD]", 667 "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", 668 "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; 669def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], 670 (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>; 671 672// Arithmetic, large integer 673def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; 674 675// Arithmetic, pairwise add 676def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>; 677 678// Arithmetic, pairwise add and accum long 679def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>; 680 681// Arithmetic, shift 682def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 683 (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]", 684 "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]", 685 "^(ASR|LSL|LSR)_ZPmI_[BHSD]", 686 "^(ASR|LSL|LSR)_ZPZI_[BHSD]", 687 "^(ASR|LSL|LSR)_ZPmZ_[BHSD]", 688 "^(ASR|LSL|LSR)_ZPZZ_[BHSD]", 689 "^(ASR|LSL|LSR)_ZZI_[BHSD]", 690 "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; 691// Arithmetic, shift right for divide 692def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 693 (instregex "^ASRD_ZPmI_[BHSD]", 694 "^ASRD_ZPZI_[BHSD]")>; 695 696// Arithmetic, shift and accumulate 697def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 698 (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>; 699 700def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], 701 (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>; 702 703 704// Arithmetic, shift by immediate 705// Arithmetic, shift by immediate and insert 706def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 707 (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>; 708 709// Arithmetic, shift complex 710def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 711 (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]", 712 "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]", 713 "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]", 714 "^SQSHRU?N[BT]_ZZI_[BHS]", 715 "^UQR?SHRN[BT]_ZZI_[BHS]")>; 716 717// Arithmetic, shift rounding 718def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 719 (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]", 720 "^[SU]RSHR_ZPmI_[BHSD]")>; 721 722// Bit manipulation 723def : InstRW<[CortexA510MCWrite<14, 13, CortexA510UnitVMC>], 724 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>; 725 726def : InstRW<[CortexA510MCWrite<22, 21, CortexA510UnitVMC>], 727 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>; 728 729def : InstRW<[CortexA510MCWrite<38, 37, CortexA510UnitVMC>], 730 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>; 731 732def : InstRW<[CortexA510MCWrite<70, 69, CortexA510UnitVMC>], 733 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>; 734 735 736// Bitwise select 737def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>; 738 739// Count/reverse bits 740def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>; 741def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; 742def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>; 743def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>; 744// Broadcast logical bitmask immediate to vector 745def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>; 746 747// Compare and set flags 748def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 749 (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", 750 "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; 751 752// Complex add 753def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>; 754 755def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>; 756 757// Complex dot product 8-bit element 758def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; 759 760// Complex dot product 16-bit element 761def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; 762 763// Complex multiply-add B, H, S element size 764def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]", 765 "^CMLA_ZZZI_[HS]")>; 766 767// Complex multiply-add D element size 768def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CMLA_ZZZ_D)>; 769 770// Conditional extract operations, scalar form 771def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>; 772 773// Conditional extract operations, SIMD&FP scalar and vector forms 774def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]", 775 "^COMPACT_ZPZ_[SD]", 776 "^SPLICE_ZPZZ?_[BHSD]")>; 777 778// Convert to floating point, 64b to float or convert to double 779def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>; 780 781// Convert to floating point, 64b to half 782def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>; 783 784// Convert to floating point, 32b to single or half 785def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>; 786 787// Convert to floating point, 32b to double 788def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>; 789 790// Convert to floating point, 16b to half 791def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>; 792 793// Copy, scalar 794def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>],(instregex "^CPY_ZPmR_[BHSD]")>; 795 796// Copy, scalar SIMD&FP or imm 797def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]", 798 "^CPY_ZPzI_[BHSD]")>; 799 800// Divides, 32 bit 801def : InstRW<[CortexA510MCWrite<15, 12, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>; 802 803// Divides, 64 bit 804def : InstRW<[CortexA510MCWrite<26, 23, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>; 805 806// Dot product, 8 bit 807def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>; 808 809// Dot product, 8 bit, using signed and unsigned integers 810def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; 811 812// Dot product, 16 bit 813def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>; 814 815// Duplicate, immediate and indexed form 816def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZI_[BHSD]", 817 "^DUP_ZZI_[BHSDQ]")>; 818 819// Duplicate, scalar form 820def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZR_[BHSD]")>; 821 822// Extend, sign or zero 823def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]", 824 "^[SU]XTH_ZPmZ_[SD]", 825 "^[SU]XTW_ZPmZ_[D]")>; 826 827// Extract 828def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>; 829 830// Extract narrow saturating 831def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", 832 "^SQXTUN[BT]_ZZ_[BHS]")>; 833 834// Extract/insert operation, SIMD and FP scalar form 835def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]", 836 "^INSR_ZV_[BHSD]")>; 837 838// Extract/insert operation, scalar 839def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^LAST[AB]_RPZ_[BHSD]", 840 "^INSR_ZR_[BHSD]")>; 841 842// Histogram operations 843def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^HISTCNT_ZPzZZ_[SD]", 844 "^HISTSEG_ZZZ")>; 845 846// Horizontal operations, B, H, S form, immediate operands only 847def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_II_[BHS]")>; 848 849// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar 850// operands only / immediate, scalar operands 851def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>; 852 853// Horizontal operations, D form, immediate operands only 854def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs INDEX_II_D)>; 855 856// Horizontal operations, D form, scalar, immediate operands)/ scalar operands 857// only / immediate, scalar operands 858def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>; 859 860// Logical 861def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 862 (instregex "^(AND|EOR|ORR)_ZI", 863 "^(AND|BIC|EOR|EOR|ORR)_ZZZ", 864 "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]", 865 "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>; 866 867def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 868 (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>; 869 870// Max/min, basic and pairwise 871def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]", 872 "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>; 873 874// Matching operations 875def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>; 876 877// Matrix multiply-accumulate 878def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; 879 880// Move prefix 881def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]", 882 "^MOVPRFX_ZZ")>; 883 884// Multiply, B, H, S element size 885def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]", 886 "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>; 887 888// Multiply, D element size 889def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D", 890 "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>; 891 892// Multiply long 893def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]", 894 "^[SU]MULL[BT]_ZZZ_[HSD]")>; 895 896// Multiply accumulate, B, H, S element size 897def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]", 898 "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>; 899 900// Multiply accumulate, D element size 901def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D", 902 "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; 903 904// Multiply accumulate long 905def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]", 906 "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>; 907 908// Multiply accumulate saturating doubling long regular 909def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]", 910 "^SQDML[AS](LB|LT)_ZZZI_[SD]")>; 911 912// Multiply saturating doubling high, B, H, S element size 913def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULH_ZZZ_[BHS]", 914 "^SQDMULH_ZZZI_[HS]")>; 915 916// Multiply saturating doubling high, D element size 917def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>; 918 919// Multiply saturating doubling long 920def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQDMULL[BT]_ZZZ_[HSD]", 921 "^SQDMULL[BT]_ZZZI_[SD]")>; 922 923// Multiply saturating rounding doubling regular/complex accumulate, B, H, S 924// element size 925def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZ_[BHS]", 926 "^SQRDCMLAH_ZZZ_[BHS]", 927 "^SQRDML[AS]H_ZZZI_[HS]", 928 "^SQRDCMLAH_ZZZI_[HS]")>; 929 930// Multiply saturating rounding doubling regular/complex accumulate, D element 931// size 932def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZI?_D", 933 "^SQRDCMLAH_ZZZ_D")>; 934 935// Multiply saturating rounding doubling regular/complex, B, H, S element size 936def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ_[BHS]", 937 "^SQRDMULH_ZZZI_[HS]")>; 938 939// Multiply saturating rounding doubling regular/complex, D element size 940def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>; 941 942// Multiply/multiply long, (8x8) polynomial 943def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>; 944 945def : InstRW<[CortexA510Write<9, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; 946 947 948// Predicate counting vector 949def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 950 (instregex "^(DEC|INC)[HWD]_ZPiI")>; 951def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 952 (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; 953 954// Reciprocal estimate 955def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; 956 957// Reduction, arithmetic, B form 958def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; 959 960// Reduction, arithmetic, H form 961def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; 962 963// Reduction, arithmetic, S form 964def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; 965 966// Reduction, arithmetic, D form 967def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>; 968 969// Reduction, logical 970def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>; 971 972// Reverse, vector 973def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", 974 "^REVB_ZPmZ_[HSD]", 975 "^REVH_ZPmZ_[SD]", 976 "^REVW_ZPmZ_D")>; 977 978// Select, vector form 979def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>; 980 981// Table lookup 982def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[BHSD]")>; 983 984// Table lookup extension 985def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>; 986 987// Transpose, vector form 988def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; 989 990// Unpack and extend 991def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; 992 993// Zip/unzip 994def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; 995 996// SVE floating-point instructions 997// ----------------------------------------------------------------------------- 998 999// Floating point absolute value/difference 1000def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FAB[SD]_ZPmZ_[HSD]", 1001 "^FAB[SD]_ZPZZ_[HSD]")>; 1002 1003// Floating point arithmetic 1004def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]", 1005 "^FADDP_ZPmZZ_[HSD]", 1006 "^FNEG_ZPmZ_[HSD]", 1007 "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>; 1008 1009// Floating point associative add, F16 1010def : InstRW<[CortexA510MCWrite<32, 29, CortexA510UnitVALU>], (instrs FADDA_VPZ_H)>; 1011 1012// Floating point associative add, F32 1013def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_S)>; 1014 1015// Floating point associative add, F64 1016def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>; 1017 1018// Floating point compare 1019def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]", 1020 "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", 1021 "^FCM(LE|LT)_PPzZ0_[HSD]", 1022 "^FCMUO_PPzZZ_[HSD]")>; 1023 1024// Floating point complex add 1025def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>; 1026 1027// Floating point complex multiply add 1028def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FCMLA_ZPmZZ_[HSD]", 1029 "^FCMLA_ZZZI_[HS]")>; 1030 1031// Floating point convert, long or narrow (F16 to F32 or F32 to F16) 1032def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoS|StoH)", 1033 "^FCVTLT_ZPmZ_HtoS", 1034 "^FCVTNT_ZPmZ_StoH")>; 1035 1036// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 1037// or F64 to F16) 1038def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)", 1039 "^FCVTLT_ZPmZ_StoD", 1040 "^FCVTNT_ZPmZ_DtoS")>; 1041 1042// Floating point convert, round to odd 1043def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "FCVTXNT_ZPmZ_DtoS")>; 1044 1045// Floating point base2 log, F16 1046def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>; 1047 1048// Floating point base2 log, F32 1049def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>; 1050 1051// Floating point base2 log, F64 1052def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>; 1053 1054// Floating point convert to integer, F16 1055def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>; 1056 1057// Floating point convert to integer, F32 1058def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>; 1059 1060// Floating point convert to integer, F64 1061def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 1062 (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>; 1063 1064// Floating point copy 1065def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>], (instregex "^FCPY_ZPmI_[HSD]", 1066 "^FDUP_ZI_[HSD]")>; 1067 1068// Floating point divide, F16 1069def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>; 1070 1071// Floating point divide, F32 1072def : InstRW<[CortexA510MCWrite<13, 10, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>; 1073 1074// Floating point divide, F64 1075def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>; 1076 1077// Floating point min/max pairwise 1078def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>; 1079 1080// Floating point min/max 1081def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>; 1082 1083// Floating point multiply 1084def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]", 1085 "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>; 1086 1087// Floating point multiply accumulate 1088def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], 1089 (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]", 1090 "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>; 1091 1092// Floating point multiply add/sub accumulate long 1093def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>; 1094 1095// Floating point reciprocal estimate, F16 1096def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H", 1097 "^FRSQRTE_ZZ_H")>; 1098 1099// Floating point reciprocal estimate, F32 1100def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S", 1101 "^FRSQRTE_ZZ_S")>; 1102// Floating point reciprocal estimate, F64 1103def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],(instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D", 1104 "^FRSQRTE_ZZ_D")>; 1105 1106// Floating point reciprocal step 1107def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; 1108 1109// Floating point reduction, F16 1110def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], 1111 (instregex "^(FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_[HSD]")>; 1112 1113// Floating point reduction, F32 1114def : InstRW<[CortexA510MCWrite<12, 11, CortexA510UnitVALU0>], 1115 (instregex "^FADDV_VPZ_H")>; 1116 1117def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU0>], 1118 (instregex "^FADDV_VPZ_S")>; 1119 1120def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], 1121 (instregex "^FADDV_VPZ_D")>; 1122 1123 1124// Floating point round to integral, F16 1125def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>; 1126 1127// Floating point round to integral, F32 1128def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>; 1129 1130// Floating point round to integral, F64 1131def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>; 1132 1133// Floating point square root, F16 1134def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_H")>; 1135 1136// Floating point square root, F32 1137def : InstRW<[CortexA510MCWrite<12, 9, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_S")>; 1138 1139// Floating point square root, F64 1140def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_D")>; 1141 1142// Floating point trigonometric exponentiation 1143def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>; 1144 1145// Floating point trigonometric multiply add 1146def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>; 1147 1148// Floating point trigonometric, miscellaneous 1149def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>; 1150def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; 1151 1152 1153// SVE BFloat16 (BF16) instructions 1154// ----------------------------------------------------------------------------- 1155 1156// Convert, F32 to BF16 1157def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; 1158 1159// Dot product 1160def : InstRW<[A510Write_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; 1161 1162// Matrix multiply accumulate 1163def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>; 1164 1165// Multiply accumulate long 1166def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>; 1167 1168// SVE Load instructions 1169// ----------------------------------------------------------------------------- 1170 1171// Load vector 1172def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instrs LDR_ZXI)>; 1173 1174// Load predicate 1175def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instrs LDR_PXI)>; 1176 1177// Contiguous load, scalar + imm 1178def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]_IMM$", 1179 "^LD1S?B_[HSD]_IMM$", 1180 "^LD1S?H_[SD]_IMM$", 1181 "^LD1S?W_D_IMM$" )>; 1182// Contiguous load, scalar + scalar 1183def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]$", 1184 "^LD1S?B_[HSD]$", 1185 "^LD1S?H_[SD]$", 1186 "^LD1S?W_D$" )>; 1187 1188// Contiguous load broadcast, scalar + imm 1189def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1R[BHWD]_IMM$", 1190 "^LD1RSW_IMM$", 1191 "^LD1RS?B_[HSD]_IMM$", 1192 "^LD1RS?H_[SD]_IMM$", 1193 "^LD1RS?W_D_IMM$", 1194 "^LD1RQ_[BHWD]_IMM$")>; 1195 1196// Contiguous load broadcast, scalar + scalar 1197def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LD1RQ_[BHWD]$")>; 1198 1199// Non temporal load, scalar + imm 1200def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRI$")>; 1201 1202// Non temporal load, scalar + scalar 1203def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRR$")>; 1204 1205// Non temporal gather load, vector + scalar 32-bit element size 1206def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^LDNT1[BHW]_ZZR_S$", 1207 "^LDNT1S[BH]_ZZR_S$")>; 1208 1209// Non temporal gather load, vector + scalar 64-bit element size 1210def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D$")>; 1211def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instrs LDNT1D_ZZR_D)>; 1212 1213// Contiguous first faulting load, scalar + scalar 1214def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LDFF1[BHWD]$", 1215 "^LDFF1S?B_[HSD]$", 1216 "^LDFF1S?H_[SD]$", 1217 "^LDFF1S?W_D$")>; 1218 1219// Contiguous non faulting load, scalar + imm 1220def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LDNF1[BHWD]_IMM$", 1221 "^LDNF1S?B_[HSD]_IMM$", 1222 "^LDNF1S?H_[SD]_IMM$", 1223 "^LDNF1S?W_D_IMM$")>; 1224 1225// Contiguous Load two structures to two vectors, scalar + imm 1226def : InstRW<[CortexA510MCWrite<3, 1, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]_IMM$")>; 1227 1228// Contiguous Load two structures to two vectors, scalar + scalar 1229def : InstRW<[CortexA510MCWrite<3, 2, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]$")>; 1230 1231// Contiguous Load three structures to three vectors, scalar + imm 1232def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]_IMM$")>; 1233 1234// Contiguous Load three structures to three vectors, scalar + scalar 1235def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]$")>; 1236 1237// Contiguous Load four structures to four vectors, scalar + imm 1238def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]_IMM$")>; 1239 1240// Contiguous Load four structures to four vectors, scalar + scalar 1241def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]$")>; 1242 1243// Gather load, vector + imm, 32-bit element size 1244def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_IMM$", 1245 "^GLD(FF)?1W_IMM$")>; 1246 1247// Gather load, vector + imm, 64-bit element size 1248def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$", 1249 "^GLD(FF)?1D_IMM$")>; 1250 1251// Gather load, 64-bit element size 1252def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], 1253 (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW(_SCALED)?$", 1254 "^GLD(FF)?1S?[BHW]_D(_SCALED)?$", 1255 "^GLD(FF)?1D_[SU]XTW(_SCALED)?$", 1256 "^GLD(FF)?1D(_SCALED)?$")>; 1257 1258// Gather load, 32-bit scaled offset 1259def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], 1260 (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$", 1261 "^GLD(FF)?1W_[SU]XTW_SCALED")>; 1262 1263// Gather load, 32-bit unpacked unscaled offset 1264def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", 1265 "^GLD(FF)?1W_[SU]XTW$")>; 1266 1267def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex "^PRF(B|H|W|D).*")>; 1268// SVE Store instructions 1269// ----------------------------------------------------------------------------- 1270 1271// Store from predicate reg 1272def : InstRW<[CortexA510VSt0], (instrs STR_PXI)>; 1273 1274// Store from vector reg 1275def : InstRW<[CortexA510VSt0], (instrs STR_ZXI)>; 1276 1277// Contiguous store, scalar + imm 1278def : InstRW<[CortexA510VSt0], (instregex "^ST1[BHWD]_IMM$", 1279 "^ST1B_[HSD]_IMM$", 1280 "^ST1H_[SD]_IMM$", 1281 "^ST1W_D_IMM$")>; 1282 1283// Contiguous store, scalar + scalar 1284def : InstRW<[CortexA510VSt0], (instregex "^ST1H(_[SD])?$")>; 1285def : InstRW<[CortexA510VSt0], (instregex "^ST1[BWD]$", 1286 "^ST1B_[HSD]$", 1287 "^ST1W_D$")>; 1288 1289// Contiguous store two structures from two vectors, scalar + imm 1290def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BHWD]_IMM$")>; 1291 1292// Contiguous store two structures from two vectors, scalar + scalar 1293def : InstRW<[CortexA510VSt<11>], (instrs ST2H)>; 1294 1295// Contiguous store two structures from two vectors, scalar + scalar 1296def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BWD]$")>; 1297 1298// Contiguous store three structures from three vectors, scalar + imm 1299def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]_IMM$")>; 1300def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D_IMM$")>; 1301 1302// Contiguous store three structures from three vectors, scalar + scalar 1303def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]$")>; 1304def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D$")>; 1305 1306// Contiguous store four structures from four vectors, scalar + imm 1307def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]_IMM$")>; 1308def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D_IMM$")>; 1309 1310// Contiguous store four structures from four vectors, scalar + scalar 1311def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]$")>; 1312 1313// Contiguous store four structures from four vectors, scalar + scalar 1314def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D$")>; 1315 1316// Non temporal store, scalar + imm 1317def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BHWD]_ZRI$")>; 1318 1319// Non temporal store, scalar + scalar 1320def : InstRW<[CortexA510VSt0], (instrs STNT1H_ZRR)>; 1321def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BWD]_ZRR$")>; 1322 1323// Scatter non temporal store, vector + scalar 32-bit element size 1324def : InstRW<[CortexA510VSt<9>], (instregex "^STNT1[BHW]_ZZR_S")>; 1325 1326// Scatter non temporal store, vector + scalar 64-bit element size 1327def : InstRW<[CortexA510VSt<7>], (instregex "^STNT1[BHWD]_ZZR_D")>; 1328 1329// Scatter store vector + imm 32-bit element size 1330def : InstRW<[CortexA510VSt<9>], (instregex "^SST1[BH]_S_IMM$", 1331 "^SST1W_IMM$")>; 1332 1333// Scatter store vector + imm 64-bit element size 1334def : InstRW<[CortexA510VSt<7>], (instregex "^SST1[BHW]_D_IMM$", 1335 "^SST1D_IMM$")>; 1336 1337// Scatter store, 32-bit scaled offset 1338def : InstRW<[CortexA510VSt<8>], 1339 (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; 1340 1341// Scatter store, 32-bit unpacked unscaled offset 1342def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BHW]_D_[SU]XTW$", 1343 "^SST1D_[SU]XTW$")>; 1344 1345// Scatter store, 32-bit unpacked scaled offset 1346def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", 1347 "^SST1D_[SU]XTW_SCALED$")>; 1348 1349// Scatter store, 32-bit unscaled offset 1350def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BH]_S_[SU]XTW$", 1351 "^SST1W_[SU]XTW$")>; 1352 1353// Scatter store, 64-bit scaled offset 1354def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[HW]_D_SCALED$", 1355 "^SST1D_SCALED$")>; 1356 1357// Scatter store, 64-bit unscaled offset 1358def : InstRW<[CortexA510VSt<8>], (instregex "^SST1[BHW]_D$", 1359 "^SST1D$")>; 1360 1361// SVE Miscellaneous instructions 1362// ----------------------------------------------------------------------------- 1363 1364// Read first fault register, unpredicated 1365def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs RDFFR_P)>; 1366 1367// Read first fault register, predicated 1368def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFR_PPz)>; 1369 1370// Read first fault register and set flags 1371def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFRS_PPz)>; 1372 1373// Set first fault register 1374// Write to first fault register 1375def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs SETFFR, WRFFR)>; 1376 1377// SVE Cryptographic instructions 1378// ----------------------------------------------------------------------------- 1379 1380// Crypto AES ops 1381def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]_ZZZ_B$", 1382 "^AESI?MC_ZZ_B$")>; 1383 1384// Crypto SHA3 ops 1385def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", 1386 "^XAR_ZZZI_[BHSD]$")>; 1387 1388def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; 1389 1390// Crypto SM4 ops 1391def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; 1392 1393} 1394