//==- AArch64SchedCortexA510.td - ARM Cortex-A510 Scheduling Definitions -*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for the ARM Cortex-A510 processor.
//
//===----------------------------------------------------------------------===//

// ===---------------------------------------------------------------------===//
// The following definitions describe the per-operand machine model.
// This works with MachineScheduler. See MCSchedModel.h for details.

// Cortex-A510 machine model for scheduling and other instruction cost heuristics.
def CortexA510Model : SchedMachineModel {
  // The Cortex-A510 is an in-order processor, hence no micro-op buffer.
  let MicroOpBufferSize = 0;

  // Modeled as issuing up to 3 micro-ops per cycle (in practice it dual-issues
  // under most circumstances).
  let IssueWidth = 3;

  // Cycles for loads to access the cache. Most loads have a latency of 2, but
  // some have higher latencies; 3 seems to be a good tradeoff.
  let LoadLatency = 3;

  // Enable PostRA scheduler pass.
  let PostRAScheduler = 1;

  // Covers instructions applicable to Cortex-A510.
  let CompleteModel = 0;

  // FIXME: Remove when all errors have been fixed.
  let FullInstRWOverlapCheck = 0;
}


//===----------------------------------------------------------------------===//
// Subtarget-specific SchedWrite types

let SchedModel = CortexA510Model in {

//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available.

// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
// Cortex-A510 is in-order.
let BufferSize = 0 in {
  // Int ALU0
  def CortexA510UnitALU0 : ProcResource<1>;
  // Int ALU1 & ALU2
  def CortexA510UnitALU12 : ProcResource<2>;
  // Int MAC, 64-bit wide
  def CortexA510UnitMAC : ProcResource<1>;
  // Int Division, not pipelined
  def CortexA510UnitDiv : ProcResource<1>;

  // There are 2 LS pipes as modeled here: 1 shared by loads and stores, and
  // 1 used by loads only.
  // Load/Store shared pipe
  def CortexA510UnitLdSt : ProcResource<1>;
  // Load pipe
  def CortexA510UnitLd1 : ProcResource<1>;

  // Branch
  def CortexA510UnitB : ProcResource<1>;
  // Pointer Authentication (PAC) pipe
  def CortexA510UnitPAC : ProcResource<1>;

  // The FP DIV/SQRT instructions execute totally differently from the FP ALU
  // instructions, which can mostly be dual-issued; that's why for now we model
  // them with 2 resources.
  // SIMD/FP/SVE ALU0
  def CortexA510UnitVALU0 : ProcResource<1>;
  // SIMD/FP/SVE ALU1
  def CortexA510UnitVALU1 : ProcResource<1>;
  // SIMD/FP/SVE MAC
  def CortexA510UnitVMAC : ProcResource<2>;
  // SIMD/FP/SVE multicycle instrs (e.g. Div, SQRT, cryptography)
  def CortexA510UnitVMC : ProcResource<1>;
}

// Resource groups: a write that names a group may use any member unit.
def CortexA510UnitLd : ProcResGroup<[CortexA510UnitLdSt,
                                     CortexA510UnitLd1]>;
def CortexA510UnitVALU : ProcResGroup<[CortexA510UnitVALU0,
                                       CortexA510UnitVALU1]>;
def CortexA510UnitALU : ProcResGroup<[CortexA510UnitALU0,
                                      CortexA510UnitALU12]>;
// These latencies are modeled without taking into account forwarding paths
// (the software optimisation guide lists latencies taking into account
// typical forwarding paths).
// Integer ALU writes; all of them may use any unit of the ALU group.
let Latency = 1 in {
def : WriteRes<WriteImm, [CortexA510UnitALU]>;   // MOVN, MOVZ
def : WriteRes<WriteI, [CortexA510UnitALU]>;     // ALU
}
let Latency = 2 in {
def : WriteRes<WriteISReg, [CortexA510UnitALU]>; // ALU of Shifted-Reg
def : WriteRes<WriteIEReg, [CortexA510UnitALU]>; // ALU of Extended-Reg
def : WriteRes<WriteExtr, [CortexA510UnitALU]>;  // EXTR from a reg pair
def : WriteRes<WriteIS, [CortexA510UnitALU]>;    // Shift/Scale
}

// MAC
// 32-bit Multiply
def : WriteRes<WriteIM32, [CortexA510UnitMAC]> {
  let Latency = 3;
}
// 64-bit Multiply
def : WriteRes<WriteIM64, [CortexA510UnitMAC]> {
  let Latency = 5;
  let ReleaseAtCycles = [2];
}

// Div
def : WriteRes<WriteID32, [CortexA510UnitDiv]> {
  let Latency = 8;
  let ReleaseAtCycles = [8];
}
def : WriteRes<WriteID64, [CortexA510UnitDiv]> {
  let Latency = 16;
  let ReleaseAtCycles = [16];
}

//===----------------------------------------------------------------------===//
// Define customized scheduler read/write types specific to the Cortex A510

//===----------------------------------------------------------------------===//
// Write on resource `res` with latency `n`.
class CortexA510Write<int n, ProcResourceKind res>
    : SchedWriteRes<[res]> {
  let Latency = n;
}

// Write on resource `res` with latency `n` that releases the resource at
// cycle `m` and begins a new issue group.
class CortexA510MCWrite<int n, int m, ProcResourceKind res>
    : SchedWriteRes<[res]> {
  let Latency = n;
  let ReleaseAtCycles = [m];
  let BeginGroup = 1;
}

// Same as CortexA510MCWrite but leaves ReleaseAtCycles at its default.
class CortexA510MC_RC0Write<int n, ProcResourceKind res>
    : SchedWriteRes<[res]> {
  let Latency = n;
  let BeginGroup = 1;
}

//===----------------------------------------------------------------------===//
// Define generic 2 micro-op types
let NumMicroOps = 2 in {
def A510Write_10cyc_1VMAC_1VALU
    : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
  let Latency = 10;
}

def A510Write_15cyc_1VMAC_1VALU
    : SchedWriteRes<[CortexA510UnitVALU, CortexA510UnitVMAC]> {
  let Latency = 15;
}
}

// Two micro-ops, one on the PAC pipe and one on the branch pipe, with
// latency `lat`.
class A510Write_PAC_B <int lat>
    : SchedWriteRes<[CortexA510UnitPAC, CortexA510UnitB]> {
  let Latency = lat;
  let NumMicroOps = 2;
}
// Load
let Latency = 2 in {
def : WriteRes<WriteLD, [CortexA510UnitLd]>;
def : WriteRes<WriteLDIdx, [CortexA510UnitLd]>;
def : WriteRes<WriteLDHi, [CortexA510UnitLd]>;
}

def CortexA510WriteVLD1 : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 3;
}
def CortexA510WriteVLD1SI : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 3;
  let SingleIssue = 1;
}
def CortexA510WriteVLD2 : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 4;
  let ReleaseAtCycles = [2];
}
def CortexA510WriteVLD3 : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 5;
  let ReleaseAtCycles = [3];
}
def CortexA510WriteVLD4 : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 6;
  let ReleaseAtCycles = [4];
}
def CortexA510WriteVLD6 : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 5;
  let ReleaseAtCycles = [3];
}
def CortexA510WriteVLD8 : SchedWriteRes<[CortexA510UnitLd]> {
  let Latency = 6;
  let ReleaseAtCycles = [4];
}

let Latency = 3 in {
def CortexA510WriteLDP1 : SchedWriteRes<[CortexA510UnitLd]>;
def CortexA510WriteLDP2 : SchedWriteRes<[CortexA510UnitLd]>;
def CortexA510WriteLDP4 : SchedWriteRes<[CortexA510UnitLd]>;
}

// Pre/Post Indexing - Performed as part of address generation
def : WriteRes<WriteAdr, []> {
  let Latency = 0;
}

// Store
let RetireOOO = 1, Latency = 1 in {
def : WriteRes<WriteST, [CortexA510UnitLdSt]>;
def : WriteRes<WriteSTP, [CortexA510UnitLdSt]>;
def : WriteRes<WriteSTIdx, [CortexA510UnitLdSt]>;
}
def : WriteRes<WriteSTX, [CortexA510UnitLdSt]> {
  let Latency = 3;
}

// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
def : WriteRes<WriteVST, [CortexA510UnitLdSt]> {
  let Latency = 5;
  let ReleaseAtCycles = [2];
}
def CortexA510WriteVST1 : SchedWriteRes<[CortexA510UnitLdSt]> {
  let Latency = 4;
}
// The multi-register store writes share a latency of 5 and differ only in how
// long they hold the load/store pipe.
let Latency = 5 in {
def CortexA510WriteVST2 : SchedWriteRes<[CortexA510UnitLdSt]> {
  let ReleaseAtCycles = [2];
}
def CortexA510WriteVST3 : SchedWriteRes<[CortexA510UnitLdSt]> {
  let ReleaseAtCycles = [3];
}
def CortexA510WriteVST4 : SchedWriteRes<[CortexA510UnitLdSt]> {
  let ReleaseAtCycles = [4];
}
}

def : WriteRes<WriteAtomic, []> {
  let Unsupported = 1;
}

// Branch
// Branch-like writes all go to the branch pipe with default WriteRes fields.
def : WriteRes<WriteBr,      [CortexA510UnitB]>;
def : WriteRes<WriteBrReg,   [CortexA510UnitB]>;
def : WriteRes<WriteSys,     [CortexA510UnitB]>;
def : WriteRes<WriteBarrier, [CortexA510UnitB]>;
def : WriteRes<WriteHint,    [CortexA510UnitB]>;

// FP ALU
// As the WriteF result is produced in F5 and can mostly be forwarded
// to the consumer at F1, the effective latency is set to 4.
def : WriteRes<WriteF, [CortexA510UnitVALU]> {
  let Latency = 4;
}
def : WriteRes<WriteFCmp, [CortexA510UnitVALU]> {
  let Latency = 3;
}
def : WriteRes<WriteFCvt, [CortexA510UnitVALU]> {
  let Latency = 4;
}
def : WriteRes<WriteFCopy, [CortexA510UnitVALU]> {
  let Latency = 3;
}
def : WriteRes<WriteFImm, [CortexA510UnitVALU]> {
  let Latency = 3;
}

// Store write on the load/store pipe that holds the pipe for `n` cycles and
// may retire out of order.
class CortexA510VSt<int n> : SchedWriteRes<[CortexA510UnitLdSt]> {
  let RetireOOO = 1;
  let ReleaseAtCycles = [n];
}

// Variant of the above that leaves ReleaseAtCycles at its default.
def CortexA510VSt0 : SchedWriteRes<[CortexA510UnitLdSt]> {
  let RetireOOO = 1;
}

// Default for 64-bit (WriteVd) and 128-bit (WriteVq) vector writes.
def : SchedAlias<WriteVd, CortexA510Write<4, CortexA510UnitVALU>>;
def : SchedAlias<WriteVq, CortexA510Write<4, CortexA510UnitVALU>>;

// FP ALU specific new schedwrite definitions
def CortexA510WriteFPALU_F3 : SchedWriteRes<[CortexA510UnitVALU]> {
  let Latency = 3;
}
def CortexA510WriteFPALU_F4 : SchedWriteRes<[CortexA510UnitVALU]> {
  let Latency = 4;
}

// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
def : WriteRes<WriteFMul, [CortexA510UnitVMAC]> {
  let Latency = 4;
}

// The following writes may all retire out of order.
def : WriteRes<WriteFDiv, [CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 22;
  let ReleaseAtCycles = [29];
}
def CortexA510WriteVMAC : SchedWriteRes<[CortexA510UnitVMAC]> {
  let RetireOOO = 1;
  let Latency = 4;
}
def CortexA510WriteFDivHP : SchedWriteRes<[CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 8;
  let ReleaseAtCycles = [5];
}
def CortexA510WriteFDivSP : SchedWriteRes<[CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 13;
  let ReleaseAtCycles = [10];
}
def CortexA510WriteFDivDP : SchedWriteRes<[CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 22;
  let ReleaseAtCycles = [19];
}
def CortexA510WriteFSqrtHP : SchedWriteRes<[CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 8;
  let ReleaseAtCycles = [5];
}
def CortexA510WriteFSqrtSP : SchedWriteRes<[CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 12;
  let ReleaseAtCycles = [9];
}
def CortexA510WriteFSqrtDP : SchedWriteRes<[CortexA510UnitVMC]> {
  let RetireOOO = 1;
  let Latency = 22;
  let ReleaseAtCycles = [19];
}
//===----------------------------------------------------------------------===//
// Subtarget-specific SchedRead types.

def : ReadAdvance<ReadVLD,     0>;
def : ReadAdvance<ReadExtrHi,  0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadST,      1>;

def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;


// MUL
def : ReadAdvance<ReadIM,  0>;
def : ReadAdvance<ReadIMA, 2>;

// Div
def : ReadAdvance<ReadID, 0>;

//===----------------------------------------------------------------------===//
// Subtarget-specific InstRWs.

// Shifted-register ALU forms: use WriteISReg when RegShiftedPred holds,
// otherwise fall back to WriteI.
def A510WriteISReg : SchedWriteVariant<[
    SchedVar<RegShiftedPred, [WriteISReg]>,
    SchedVar<NoSchedPred,    [WriteI]>
]>;
def : InstRW<[A510WriteISReg], (instregex ".*rs$")>;
def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;

// Pointer Authentication Instructions (v8.3 PAC)
// -----------------------------------------------------------------------------

// Authenticate data address
// Authenticate instruction address
// Compute pointer authentication code for data address
// Compute pointer authentication code, using generic key
// Compute pointer authentication code for instruction address
def : InstRW<[CortexA510Write<3, CortexA510UnitPAC>],
             (instregex "^AUT", "^PAC")>;

// Branch and link, register, with pointer authentication
// Branch, register, with pointer authentication
// Branch, return, with pointer authentication
def : InstRW<[A510Write_PAC_B<1>],
             (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ,
                     BRAA, BRAAZ, BRAB, BRABZ,
                     RETAA, RETAB,
                     ERETAA, ERETAB)>;

// Load register, with pointer authentication
def : InstRW<[CortexA510Write<2, CortexA510UnitPAC>],
             (instregex "^LDRA[AB](indexed|writeback)")>;

// Strip pointer authentication code
def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>],
             (instrs XPACD, XPACI, XPACLRI)>;
//---
// Miscellaneous
//---
// Load pair. The pre/post-indexed forms carry an extra leading WriteAdr for
// the base-register writeback.
def : InstRW<[CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?Wi")>;
def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPSi")>;
def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)i")>;
def : InstRW<[CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQi")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD1SI,CortexA510WriteLDP1], (instregex "LDPS?W(pre|post)")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP1], (instregex "LDPS(pre|post)")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD1,CortexA510WriteLDP4], (instregex "LDPQ(pre|post)")>;
def : InstRW<[WriteI], (instrs COPY)>;
//---
// Vector Loads - 128-bit per cycle
//---
// 1-element structures
def : InstRW<[CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;                // single element
def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
def : InstRW<[CortexA510WriteVLD1], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;

// NOTE(review): the _POST forms of LD1Onev (128-bit) and LD1Twov below use
// CortexA510WriteVLD2 while their non-writeback forms above use
// CortexA510WriteVLD1 - confirm against the software optimisation guide.
def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;

// 2-element structures
def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;

// Post-index forms only: the patterns require the _POST suffix so that they
// do not also match the non-writeback forms handled just above (which have
// no address writeback for WriteAdr to describe).
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;

// 3-element structures
def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;

def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;

// 4-element structures
def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;

def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;

//---
// Vector Stores
//---
def : InstRW<[CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;

def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
// Same arrangement list as the non-writeback form above.
def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

//---
// Floating Point Conversions, MAC, DIV, SQRT
//---
def : InstRW<[CortexA510WriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^XTN")>;
def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;

def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
def : InstRW<[CortexA510WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;

def : InstRW<[CortexA510WriteVMAC], (instregex "^FN?M(ADD|SUB).*")>;
def : InstRW<[CortexA510WriteVMAC], (instregex "^FML(A|S)v.*")>;
def : InstRW<[CortexA510WriteFDivHP], (instrs FDIVHrr)>;
def : InstRW<[CortexA510WriteFDivSP], (instrs FDIVSrr)>;
def : InstRW<[CortexA510WriteFDivDP], (instrs FDIVDrr)>;
def : InstRW<[CortexA510WriteFDivHP], (instregex "^FDIVv.*16$")>;
def : InstRW<[CortexA510WriteFDivSP], (instregex "^FDIVv.*32$")>;
def : InstRW<[CortexA510WriteFDivDP], (instregex "^FDIVv.*64$")>;
def : InstRW<[CortexA510WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
def : InstRW<[CortexA510WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
def : InstRW<[CortexA510WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;

// 4.15. Advanced SIMD integer instructions
// ASIMD absolute diff
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
// ASIMD absolute diff accum
def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>;
// ASIMD absolute diff long
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>;
// ASIMD arith #1
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)",
  "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)",
  "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>;
// ASIMD arith #2
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
  "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
  "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$",
  "ADDPv(2i32|4i16|8i8)$")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
  "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
  "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$",
  "ADDPv(16i8|2i64|4i32|8i16)$")>;
// ASIMD arith #3
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv",
  "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>;
// ASIMD arith #5
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>;
// ASIMD arith, reduce
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDVv", "SADDLVv", "UADDLVv")>;
// ASIMD compare #1
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
// ASIMD compare #2
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
// ASIMD logical #1
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8",
  "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8",
  "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
// ASIMD max/min, basic
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
// The 128-bit list is 16i8/4i32/8i16, mirroring the 64-bit list above.
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>;
// ASIMD max/min, reduce
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>;
// ASIMD multiply, by element
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
  "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
// ASIMD multiply
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv8i8)>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULv16i8)>;
// ASIMD multiply accumulate
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
// ASIMD multiply accumulate half
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQRDML[AS]H[vi]")>;
// ASIMD multiply accumulate long
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]ML[AS]Lv")>;
// ASIMD multiply accumulate long #2
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SQDML[AS]L[iv]")>;
// ASIMD dot product
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv8i8")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTv16i8")>;
// ASIMD dot product, by scalar
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]DOTlanev")>;
// ASIMD multiply long
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
// ASIMD polynomial (8x8) multiply long
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>;
// ASIMD pairwise add and accumulate
def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>;
// ASIMD shift accumulate
def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
// ASIMD shift accumulate #2
def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>;
// ASIMD shift by immed
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv",
  "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
// ASIMD shift by immed, long
// SXTL and UXTL are aliases for SHLL
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[US]?SHLLv")>;
// ASIMD shift by immed #2
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
  "[SU]RSHRv(16i8|2i64|4i32|8i16)")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)",
  "RSHRNv(16i8|4i32|8i16)")>;
// ASIMD shift by register
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
// ASIMD shift by register #2
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;

def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>;

def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>;

// Cryptography extensions
// -----------------------------------------------------------------------------

// Crypto AES ops
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;

// Crypto polynomial (64x64) multiply long
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>;

// Crypto SHA1 hash acceleration op
// Crypto SHA1 schedule acceleration ops
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|SU1)")>;

// Crypto SHA1 hash acceleration ops
// Crypto SHA256 hash acceleration ops
// (CortexA510MCWrite<n, m, res>: Latency = n, ReleaseAtCycles = [m],
// BeginGroup = 1 on resource res.)
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>;

// Crypto SHA256 schedule acceleration ops
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>;

// Crypto SHA512 hash acceleration ops
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>;

// Crypto SHA3 ops
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3, XAR)>;
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs RAX1)>;


// Crypto SM3 ops
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
                                                            "^SM3TT[12][AB]$")>;

// Crypto SM4 ops
def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>;

// CRC
// -----------------------------------------------------------------------------

// CRC32 is modeled on the integer MAC pipe.
def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")>;

// SVE Predicate instructions
// In this section the predicate-producing entries use latency 6 on
// CortexA510UnitVALU0; the scalar counting entries use latency 1 on the
// integer ALU group.

// Loop control, based on predicate
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP,
                                                  BRKB_PPmP, BRKB_PPzP)>;

// Loop control, based on predicate and flag setting
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>;

// Loop control, propagating
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;

// Loop control, propagating and flag setting
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>;
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>;


// Loop control, based on GPR
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
             (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;

def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;

// Loop terminate
def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;

// Predicate counting scalar
def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;

def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
             (instregex "^CNT[BHWD]_XPiI")>;

def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
             (instregex "^(INC|DEC)[BHWD]_XPiI")>;

def : InstRW<[CortexA510Write<1, CortexA510UnitALU>],
             (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>;

// Predicate counting scalar, active predicate
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
             (instregex "^CNTP_XPP_[BHSD]")>;

def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
             (instregex "^(DEC|INC)P_XP_[BHSD]")>;

// The saturating forms are modeled with a longer latency (8) than the plain
// DEC/INC forms above (6).
def : InstRW<[CortexA510Write<8, CortexA510UnitVALU0>],
             (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
                        "^(UQDEC|UQINC)P_WP_[BHSD]",
                        "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>;


// Predicate counting vector, active predicate
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;

// Predicate logical
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
             (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;

// Predicate logical, flag setting
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>],
             (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;

// Predicate reverse
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>;

// Predicate select
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs SEL_PPPP)>;

// Predicate set
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;

// Predicate set/initialize, set flags
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>;

// Predicate find first/next
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;

// Predicate test
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PTEST_PP)>;

// Predicate transpose
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>;

// Predicate unpack and widen
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>;

// Predicate zip/unzip
def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>;


// SVE integer instructions
// -----------------------------------------------------------------------------
// Arithmetic, absolute diff
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>;

// Arithmetic, absolute diff accum
def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;

// Arithmetic, absolute diff accum long
def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;

// Arithmetic, absolute diff long
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;

// Arithmetic, basic
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
             (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]",
                        "^(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
                        "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
                        "^(ADD|SUB)_ZZZ_[BHSD]",
                        "^(ADD|SUB|SUBR)_ZI_[BHSD]",
                        "^ADR_[SU]XTW_ZZZ_D_[0123]",
                        "^ADR_LSL_ZZZ_[SD]_[0123]",
                        "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
                        "^SADDLBT_ZZZ_[HSD]",
                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
                        "^SSUBL(BT|TB)_ZZZ_[HSD]")>;

// Arithmetic, complex
def 
: InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 659 (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", 660 "^SQ(ABS|NEG)_ZPmZ_[BHSD]", 661 "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]", 662 "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", 663 "^[SU]Q(ADD|SUB)_ZI_[BHSD]", 664 "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", 665 "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; 666 667// Arithmetic, large integer 668def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; 669 670// Arithmetic, pairwise add 671def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>; 672 673// Arithmetic, pairwise add and accum long 674def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>; 675 676// Arithmetic, shift 677def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 678 (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]", 679 "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]", 680 "^(ASR|LSL|LSR)_ZPmI_[BHSD]", 681 "^(ASR|LSL|LSR)_ZPZI_[BHSD]", 682 "^(ASR|LSL|LSR)_ZPmZ_[BHSD]", 683 "^(ASR|LSL|LSR)_ZPZZ_[BHSD]", 684 "^(ASR|LSL|LSR)_ZZI_[BHSD]", 685 "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; 686// Arithmetic, shift right for divide 687def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 688 (instregex "^ASRD_ZPmI_[BHSD]", 689 "^ASRD_ZPZI_[BHSD]")>; 690 691// Arithmetic, shift and accumulate 692def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 693 (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>; 694 695def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], 696 (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>; 697 698 699// Arithmetic, shift by immediate 700// Arithmetic, shift by immediate and insert 701def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 702 (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>; 703 704// Arithmetic, shift complex 705def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 706 (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]", 707 "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]", 708 
"^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]", 709 "^SQSHRU?N[BT]_ZZI_[BHS]", 710 "^UQR?SHRN[BT]_ZZI_[BHS]")>; 711 712// Arithmetic, shift rounding 713def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 714 (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]", 715 "^[SU]RSHR_ZPmI_[BHSD]")>; 716 717// Bit manipulation 718def : InstRW<[CortexA510MCWrite<14, 13, CortexA510UnitVMC>], 719 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>; 720 721def : InstRW<[CortexA510MCWrite<22, 21, CortexA510UnitVMC>], 722 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>; 723 724def : InstRW<[CortexA510MCWrite<38, 37, CortexA510UnitVMC>], 725 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>; 726 727def : InstRW<[CortexA510MCWrite<70, 69, CortexA510UnitVMC>], 728 (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>; 729 730 731// Bitwise select 732def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>; 733 734// Count/reverse bits 735def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>; 736def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; 737def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>; 738def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>; 739// Broadcast logical bitmask immediate to vector 740def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>; 741 742// Compare and set flags 743def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 744 (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", 745 "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; 746 747// Complex add 748def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>; 749 750def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>; 751 752// Complex dot product 8-bit element 753def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_S, 
CDOT_ZZZI_S)>; 754 755// Complex dot product 16-bit element 756def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; 757 758// Complex multiply-add B, H, S element size 759def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]", 760 "^CMLA_ZZZI_[HS]")>; 761 762// Complex multiply-add D element size 763def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs CMLA_ZZZ_D)>; 764 765// Conditional extract operations, scalar form 766def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>; 767 768// Conditional extract operations, SIMD&FP scalar and vector forms 769def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]", 770 "^COMPACT_ZPZ_[SD]", 771 "^SPLICE_ZPZZ?_[BHSD]")>; 772 773// Convert to floating point, 64b to float or convert to double 774def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>; 775 776// Convert to floating point, 64b to half 777def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>; 778 779// Convert to floating point, 32b to single or half 780def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>; 781 782// Convert to floating point, 32b to double 783def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>; 784 785// Convert to floating point, 16b to half 786def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>; 787 788// Copy, scalar 789def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>],(instregex "^CPY_ZPmR_[BHSD]")>; 790 791// Copy, scalar SIMD&FP or imm 792def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]", 793 "^CPY_ZPzI_[BHSD]")>; 794 795// Divides, 32 bit 796def : InstRW<[CortexA510MCWrite<15, 12, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>; 797 798// Divides, 
64 bit 799def : InstRW<[CortexA510MCWrite<26, 23, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>; 800 801// Dot product, 8 bit 802def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>; 803 804// Dot product, 8 bit, using signed and unsigned integers 805def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; 806 807// Dot product, 16 bit 808def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>; 809 810// Duplicate, immediate and indexed form 811def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZI_[BHSD]", 812 "^DUP_ZZI_[BHSDQ]")>; 813 814// Duplicate, scalar form 815def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZR_[BHSD]")>; 816 817// Extend, sign or zero 818def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]", 819 "^[SU]XTH_ZPmZ_[SD]", 820 "^[SU]XTW_ZPmZ_[D]")>; 821 822// Extract 823def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>; 824 825// Extract narrow saturating 826def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", 827 "^SQXTUN[BT]_ZZ_[BHS]")>; 828 829// Extract/insert operation, SIMD and FP scalar form 830def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]", 831 "^INSR_ZV_[BHSD]")>; 832 833// Extract/insert operation, scalar 834def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^LAST[AB]_RPZ_[BHSD]", 835 "^INSR_ZR_[BHSD]")>; 836 837// Histogram operations 838def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU0>], (instregex "^HISTCNT_ZPzZZ_[SD]", 839 "^HISTSEG_ZZZ")>; 840 841// Horizontal operations, B, H, S form, immediate operands only 842def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_II_[BHS]")>; 843 844// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar 845// 
operands only / immediate, scalar operands 846def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>; 847 848// Horizontal operations, D form, immediate operands only 849def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs INDEX_II_D)>; 850 851// Horizontal operations, D form, scalar, immediate operands)/ scalar operands 852// only / immediate, scalar operands 853def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>; 854 855// Logical 856def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], 857 (instregex "^(AND|EOR|ORR)_ZI", 858 "^(AND|BIC|EOR|EOR|ORR)_ZZZ", 859 "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]", 860 "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>; 861 862def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], 863 (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>; 864 865// Max/min, basic and pairwise 866def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]", 867 "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>; 868 869// Matching operations 870def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>; 871 872// Matrix multiply-accumulate 873def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; 874 875// Move prefix 876def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]", 877 "^MOVPRFX_ZZ")>; 878 879// Multiply, B, H, S element size 880def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]", 881 "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>; 882 883// Multiply, D element size 884def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D", 885 "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>; 886 887// Multiply long 888def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]", 889 "^[SU]MULL[BT]_ZZZ_[HSD]")>; 890 891// Multiply accumulate, B, H, S element size 
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]",
                        "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;

// Multiply accumulate, D element size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D",
                        "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;

// Multiply accumulate long
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
                        "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;

// Multiply accumulate saturating doubling long regular
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]",
                        "^SQDML[AS](LB|LT)_ZZZI_[SD]")>;

// Multiply saturating doubling high, B, H, S element size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^SQDMULH_ZZZ_[BHS]",
                        "^SQDMULH_ZZZI_[HS]")>;

// Multiply saturating doubling high, D element size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;

// Multiply saturating doubling long
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
                        "^SQDMULL[BT]_ZZZI_[SD]")>;

// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
// element size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
                        "^SQRDCMLAH_ZZZ_[BHS]",
                        "^SQRDML[AS]H_ZZZI_[HS]",
                        "^SQRDCMLAH_ZZZI_[HS]")>;

// Multiply saturating rounding doubling regular/complex accumulate, D element
// size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^SQRDML[AS]H_ZZZI?_D",
                        "^SQRDCMLAH_ZZZ_D")>;

// Multiply saturating rounding doubling regular/complex, B, H, S element size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^SQRDMULH_ZZZ_[BHS]",
                        "^SQRDMULH_ZZZI_[HS]")>;

// Multiply saturating rounding doubling regular/complex, D element size
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>;

// Multiply/multiply long, (8x8) polynomial
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>;

def : InstRW<[CortexA510Write<6, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>;


// Predicate counting vector
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>;

// Reciprocal estimate
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;

// Reduction, arithmetic, B form
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;

// Reduction, arithmetic, H form
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;

// Reduction, arithmetic, S form
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;

// Reduction, arithmetic, D form
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;

// Reduction, logical
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>;

// Reverse, vector
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^REV_ZZ_[BHSD]",
                        "^REVB_ZPmZ_[HSD]",
                        "^REVH_ZPmZ_[SD]",
                        "^REVW_ZPmZ_D")>;

// Select, vector form
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>;

// Table lookup
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[BHSD]")>;

// Table lookup extension
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>;

// Transpose, vector form
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;

// Unpack and extend
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;

// Zip/unzip
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;

// SVE floating-point instructions
// -----------------------------------------------------------------------------

// Floating point absolute value/difference
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^FAB[SD]_ZPmZ_[HSD]",
                        "^FAB[SD]_ZPZZ_[HSD]")>;

// Floating point arithmetic
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]",
                        "^FADDP_ZPmZZ_[HSD]",
                        "^FNEG_ZPmZ_[HSD]",
                        "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>;

// Floating point associative add, F16
def : InstRW<[CortexA510MCWrite<32, 29, CortexA510UnitVALU>], (instrs FADDA_VPZ_H)>;

// Floating point associative add, F32
def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_S)>;

// Floating point associative add, F64
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>;

// Floating point compare
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^FACG[ET]_PPzZZ_[HSD]",
                        "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
                        "^FCM(LE|LT)_PPzZ0_[HSD]",
                        "^FCMUO_PPzZZ_[HSD]")>;

// Floating point complex add
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>;

// Floating point complex multiply add
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^FCMLA_ZPmZZ_[HSD]",
                        "^FCMLA_ZZZI_[HS]")>;

// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
                        "^FCVTLT_ZPmZ_HtoS",
                        "^FCVTNT_ZPmZ_StoH")>;

// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
// or F64 to F16)
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
                        "^FCVTLT_ZPmZ_StoD",
                        "^FCVTNT_ZPmZ_DtoS")>;

// Floating point convert, round to odd
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "^FCVTXNT_ZPmZ_DtoS")>;

// Floating point base2 log, F16
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;

// Floating point base2 log, F32
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;

// Floating point base2 log, F64
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;

// Floating point convert to integer, F16
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;

// Floating point convert to integer, F32
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;

// Floating point convert to integer, F64
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
             (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;

// Floating point copy
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU0>],
             (instregex "^FCPY_ZPmI_[HSD]",
                        "^FDUP_ZI_[HSD]")>;

// Floating point divide, F16
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;

// Floating point divide, F32
def : InstRW<[CortexA510MCWrite<13, 10, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;

// Floating point divide, F64
def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;

// Floating point min/max pairwise
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;

// Floating point min/max
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>;

// Floating point multiply
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]",
                        "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>;

// Floating point multiply accumulate
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]",
                        "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>;

// Floating point multiply add/sub accumulate long
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;

// Floating point reciprocal estimate, F16
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H",
                        "^FRSQRTE_ZZ_H")>;

// Floating point reciprocal estimate, F32
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S",
                        "^FRSQRTE_ZZ_S")>;
// Floating point reciprocal estimate, F64
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>],
             (instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D",
                        "^FRSQRTE_ZZ_D")>;

// Floating point reciprocal step
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;

// Floating point reduction, min/max (F16, F32 and F64)
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
             (instregex "^(FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_[HSD]")>;

// Floating point reduction, add, F16
def : InstRW<[CortexA510MCWrite<12, 11, CortexA510UnitVALU0>],
             (instregex "^FADDV_VPZ_H")>;

// Floating point reduction, add, F32
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU0>],
             (instregex "^FADDV_VPZ_S")>;

// Floating point reduction, add, F64
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>],
             (instregex "^FADDV_VPZ_D")>;


// Floating point round to integral, F16
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;

// Floating point round to integral, F32
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;

// Floating point round to integral, F64
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;

// Floating point square root, F16
def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_H")>;

// Floating point square root, F32
def : InstRW<[CortexA510MCWrite<12, 9, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_S")>;

// Floating point square root, F64
def : InstRW<[CortexA510MCWrite<22, 19, CortexA510UnitVMC>], (instregex "^FSQRT_ZPmZ_D")>;

// Floating point trigonometric exponentiation
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>;

// Floating point trigonometric multiply add
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>;

// Floating point trigonometric, miscellaneous
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>;
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>;


// SVE BFloat16 (BF16) instructions
// -----------------------------------------------------------------------------

// Convert, F32 to BF16
def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;

// Dot product
def : InstRW<[A510Write_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;

// Matrix multiply accumulate
def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>;

// Multiply accumulate long
def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>;

// SVE Load instructions
// -----------------------------------------------------------------------------

// Load vector
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instrs LDR_ZXI)>;

// Load predicate
def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instrs LDR_PXI)>;

// Contiguous load, scalar + imm
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LD1[BHWD]_IMM$",
                        "^LD1S?B_[HSD]_IMM$",
                        "^LD1S?H_[SD]_IMM$",
                        "^LD1S?W_D_IMM$")>;
// Contiguous load, scalar + scalar
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LD1[BHWD]$",
                        "^LD1S?B_[HSD]$",
                        "^LD1S?H_[SD]$",
                        "^LD1S?W_D$")>;

// Contiguous load broadcast, scalar + imm
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LD1R[BHWD]_IMM$",
                        "^LD1RSW_IMM$",
                        "^LD1RS?B_[HSD]_IMM$",
                        "^LD1RS?H_[SD]_IMM$",
                        "^LD1RS?W_D_IMM$",
                        "^LD1RQ_[BHWD]_IMM$")>;

// Contiguous load broadcast, scalar + scalar
def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LD1RQ_[BHWD]$")>;

// Non temporal load, scalar + imm
def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRI$")>;

// Non temporal load, scalar + scalar
def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRR$")>;

// Non temporal gather load, vector + scalar 32-bit element size
def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>],
             (instregex "^LDNT1[BHW]_ZZR_S_REAL$",
                        "^LDNT1S[BH]_ZZR_S_REAL$")>;

// Non temporal gather load, vector + scalar 64-bit element size
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D_REAL$")>;
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], (instrs LDNT1D_ZZR_D_REAL)>;

// Contiguous first faulting load, scalar + scalar
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LDFF1[BHWD]_REAL$",
                        "^LDFF1S?B_[HSD]_REAL$",
                        "^LDFF1S?H_[SD]_REAL$",
                        "^LDFF1S?W_D_REAL$")>;

// Contiguous non faulting load, scalar + imm
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>],
             (instregex "^LDNF1[BHWD]_IMM_REAL$",
                        "^LDNF1S?B_[HSD]_IMM_REAL$",
                        "^LDNF1S?H_[SD]_IMM_REAL$",
                        "^LDNF1S?W_D_IMM_REAL$")>;

// Contiguous Load two structures to two vectors, scalar + imm
def : InstRW<[CortexA510MCWrite<3, 1, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]_IMM$")>;

// Contiguous Load two structures to two vectors, scalar + scalar
def : InstRW<[CortexA510MCWrite<3, 2, CortexA510UnitLdSt>], (instregex "^LD2[BHWD]$")>;

// Contiguous Load three structures to three vectors, scalar + imm
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]_IMM$")>;

// Contiguous Load three structures to three vectors, scalar + scalar
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD3[BHWD]$")>;

// Contiguous Load four structures to four vectors, scalar + imm
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]_IMM$")>;

// Contiguous Load four structures to four vectors, scalar + scalar
def : InstRW<[CortexA510MCWrite<5, 3, CortexA510UnitLdSt>], (instregex "^LD4[BHWD]$")>;

// Gather load, vector + imm, 32-bit element size
def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLdSt>],
             (instregex "^GLD(FF)?1S?[BH]_S_IMM_REAL$",
                        "^GLD(FF)?1W_IMM_REAL$")>;

// Gather load, vector + imm, 64-bit element size
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>],
             (instregex "^GLD(FF)?1S?[BHW]_D_IMM_REAL$",
                        "^GLD(FF)?1D_IMM_REAL$")>;

// Gather load, 64-bit element size
def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>],
             (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW_(SCALED_)?REAL$",
                        "^GLD(FF)?1S?[BHW]_D_(SCALED_)?REAL$",
                        "^GLD(FF)?1D_[SU]XTW_(SCALED_)?REAL$",
                        "^GLD(FF)?1D_(SCALED_)?REAL$")>;

// Gather load, 32-bit scaled offset
def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>],
             (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED_REAL$",
                        "^GLD(FF)?1W_[SU]XTW_SCALED_REAL")>;

// Gather load, 32-bit unpacked unscaled offset
def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>],
             (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW_REAL$",
                        "^GLD(FF)?1W_[SU]XTW_REAL$")>;

// SVE prefetch
// NOTE(review): prefetches are given zero latency on the VALU group rather
// than a load/store resource -- confirm this is the intended modelling.
def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex "^PRF(B|H|W|D).*")>;

// SVE Store instructions
// -----------------------------------------------------------------------------

// Store from predicate reg
def : InstRW<[CortexA510VSt0], (instrs STR_PXI)>;

// Store from vector reg
def : InstRW<[CortexA510VSt0], (instrs STR_ZXI)>;

// Contiguous store, scalar + imm
def : InstRW<[CortexA510VSt0],
             (instregex "^ST1[BHWD]_IMM$",
                        "^ST1B_[HSD]_IMM$",
                        "^ST1H_[SD]_IMM$",
                        "^ST1W_D_IMM$")>;

// Contiguous store, scalar + scalar
def : InstRW<[CortexA510VSt0], (instregex "^ST1H(_[SD])?$")>;
def : InstRW<[CortexA510VSt0],
             (instregex "^ST1[BWD]$",
                        "^ST1B_[HSD]$",
                        "^ST1W_D$")>;

// Contiguous store two structures from two vectors, scalar + imm
def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BHWD]_IMM$")>;

// Contiguous store two structures from two vectors, scalar + scalar (H)
def : InstRW<[CortexA510VSt<11>], (instrs ST2H)>;

// Contiguous store two structures from two vectors, scalar + scalar (B, W, D)
def : InstRW<[CortexA510VSt<11>], (instregex "^ST2[BWD]$")>;

// Contiguous store three structures from three vectors, scalar + imm
def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]_IMM$")>;
def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D_IMM$")>;

// Contiguous store three structures from three vectors, scalar + scalar
def : InstRW<[CortexA510VSt<25>], (instregex "^ST3[BHW]$")>;
def : InstRW<[CortexA510VSt<14>], (instregex "^ST3D$")>;

// Contiguous store four structures from four vectors, scalar + imm
def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]_IMM$")>;
def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D_IMM$")>;

// Contiguous store four structures from four vectors, scalar + scalar (B, H, W)
def : InstRW<[CortexA510VSt<50>], (instregex "^ST4[BHW]$")>;

// Contiguous store four structures from four vectors, scalar + scalar (D)
def : InstRW<[CortexA510VSt<25>], (instregex "^ST4D$")>;

// Non temporal store, scalar + imm
def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BHWD]_ZRI$")>;

// Non temporal store, scalar + scalar
def : InstRW<[CortexA510VSt0], (instrs STNT1H_ZRR)>;
def : InstRW<[CortexA510VSt0], (instregex "^STNT1[BWD]_ZRR$")>;

// Scatter non temporal store, vector + scalar 32-bit element size
def : InstRW<[CortexA510VSt<9>], (instregex "^STNT1[BHW]_ZZR_S")>;

// Scatter non temporal store, vector + scalar 64-bit element size
def : InstRW<[CortexA510VSt<7>], (instregex "^STNT1[BHWD]_ZZR_D")>;

// Scatter store vector + imm 32-bit element size
def : InstRW<[CortexA510VSt<9>],
             (instregex "^SST1[BH]_S_IMM$",
                        "^SST1W_IMM$")>;

// Scatter store vector + imm 64-bit element size
def : InstRW<[CortexA510VSt<7>],
             (instregex "^SST1[BHW]_D_IMM$",
                        "^SST1D_IMM$")>;

// Scatter store, 32-bit scaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;

// Scatter store, 32-bit unpacked unscaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[BHW]_D_[SU]XTW$",
                        "^SST1D_[SU]XTW$")>;

// Scatter store, 32-bit unpacked scaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
                        "^SST1D_[SU]XTW_SCALED$")>;

// Scatter store, 32-bit unscaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[BH]_S_[SU]XTW$",
                        "^SST1W_[SU]XTW$")>;

// Scatter store, 64-bit scaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[HW]_D_SCALED$",
                        "^SST1D_SCALED$")>;

// Scatter store, 64-bit unscaled offset
def : InstRW<[CortexA510VSt<8>],
             (instregex "^SST1[BHW]_D$",
                        "^SST1D$")>;

// SVE Miscellaneous instructions
// -----------------------------------------------------------------------------

// Read first fault register, unpredicated
def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs RDFFR_P_REAL)>;

// Read first fault register, predicated
def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFR_PPz_REAL)>;

// Read first fault register and set flags
def : InstRW<[CortexA510Write<3, CortexA510UnitALU0>], (instrs RDFFRS_PPz)>;

// Set first fault register
// Write to first fault register
def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs SETFFR, WRFFR)>;

// SVE Cryptographic instructions
// -----------------------------------------------------------------------------

// Crypto AES ops
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
             (instregex "^AES[DE]_ZZZ_B$",
                        "^AESI?MC_ZZ_B$")>;

// Crypto SHA3 ops
def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
             (instregex "^(BCAX|EOR3)_ZZZZ$",
                        "^XAR_ZZZI_[BHSD]$")>;

def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>;

// Crypto SM4 ops
def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>;

} // SchedModel = CortexA510Model