//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for the Ampere Computing Ampere-1 to
// support instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// The Ampere-1 core is an out-of-order micro-architecture.  The front
// end has branch prediction, with a 10-cycle recovery time from a
// mispredicted branch.  Instructions coming out of the front end are
// decoded into internal micro-ops (uops).

// Top-level machine model record.  Every scheduling definition inside the
// `let SchedModel = Ampere1Model in { ... }` scope opened below is attached
// to this model.
def Ampere1Model : SchedMachineModel {
  let IssueWidth            =   4;  // 4-way decode and dispatch
  let MicroOpBufferSize     = 174;  // micro-op re-order buffer size
  let LoadLatency           =   4;  // Optimistic load latency
  let MispredictPenalty     =  10;  // Branch mispredict penalty
  let LoopMicroOpBufferSize =  32;  // Instruction queue size
  let CompleteModel         =   0;  // Not every instruction has a mapping

  // The element type is required by TableGen: a bare `list` is not a type.
  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
                                                    SMEUnsupported.F,
                                                    PAUnsupported.F,
                                                    [HasMTE]);
}

let SchedModel = Ampere1Model in {

//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available on Ampere-1.
// Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP,
// and 2 memory) issue into.  The integer and FP schedulers can each issue
// one uop per cycle, while the memory schedulers can each issue one load
// and one store address calculation per cycle.

def Ampere1UnitA  : ProcResource<2>; // integer single-cycle, branch, and flags r/w
def Ampere1UnitB  : ProcResource<2>; // integer single-cycle, and complex shifts
def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle
def Ampere1UnitL  : ProcResource<2>; // load
def Ampere1UnitS  : ProcResource<2>; // store address calculation
def Ampere1UnitX  : ProcResource<1>; // FP and vector operations, and flag write
def Ampere1UnitY  : ProcResource<1>; // FP and vector operations, and crypto
def Ampere1UnitZ  : ProcResource<1>; // FP store data and FP-to-integer moves

// Groups: a uop assigned to a group may use either member unit.
def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;

//===----------------------------------------------------------------------===//
// Define customized scheduler read/write types specific to the Ampere-1.
//
// Naming convention used below: Ampere1Write_<N>cyc_<resources>, where
// Latency is <N> and <resources> spells out the consumed units (for example
// "2S_2Z" is two S units and two Z units).  NumMicroOps equals the number of
// listed resources.

// -- 1-cycle writes
def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
  let Latency = 1;
  let NumMicroOps = 1;
}

def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
  let Latency = 1;
  let NumMicroOps = 2;
}

def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
  let Latency = 1;
  let NumMicroOps = 1;
}

def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
  let Latency = 1;
  let NumMicroOps = 1;
}

def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
  let Latency = 1;
  let NumMicroOps = 1;
}

def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
  let Latency = 1;
  let NumMicroOps = 1;
}

def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
  let Latency = 1;
  let NumMicroOps = 2;
}

// -- 2-cycle writes
def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
  let Latency = 2;
  let NumMicroOps = 1;
}

def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
                                              Ampere1UnitS]> {
  let Latency = 2;
  let NumMicroOps = 3;
}

def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
                                                 Ampere1UnitZ]> {
  let Latency = 2;
  let NumMicroOps = 3;
}

def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 2;
  let NumMicroOps = 1;
}

def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

// -- 3-cycle writes
def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
  let Latency = 3;
  let NumMicroOps = 1;
}

def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 3;
  let NumMicroOps = 1;
}

// The three multi-resource 3-cycle writes below carry Latency = 3 so that the
// value agrees with the cycle count encoded in each name, as it does for
// every other write in this family.
def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS,
                                                 Ampere1UnitAB]> {
  let Latency = 3;
  let NumMicroOps = 3;
}

def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ,
                                             Ampere1UnitZ]> {
  let Latency = 3;
  let NumMicroOps = 3;
}

def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
                                             Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 3;
  let NumMicroOps = 4;
}

// -- 4-cycle writes
def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
  let Latency = 4;
  let NumMicroOps = 1;
}

def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
  let Latency = 4;
  let NumMicroOps = 1;
}

def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
  let Latency = 4;
  let NumMicroOps = 1;
}

def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
  let Latency = 4;
  let NumMicroOps = 1;
}

def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> {
  let Latency = 4;
  let NumMicroOps = 1;
}

def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
  let Latency = 4;
  let NumMicroOps = 2;
}

def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 4;
  let NumMicroOps = 1;
}

def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 4;
  let NumMicroOps = 2;
}

def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS,
                                                 Ampere1UnitZ]> {
  let Latency = 4;
  let NumMicroOps = 3;
}

def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
                                             Ampere1UnitS, Ampere1UnitZ,
                                             Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 4;
  let NumMicroOps = 6;
}

// -- 5-cycle writes
def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
  let Latency = 5;
  let NumMicroOps = 2;
}

def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
  let Latency = 5;
  let NumMicroOps = 1;
}

def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
  let Latency = 5;
  let NumMicroOps = 1;
}

def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
  let Latency = 5;
  let NumMicroOps = 1;
}

def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
  let Latency = 5;
  let NumMicroOps = 2;
}

def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> {
  let Latency = 5;
  let NumMicroOps = 2;
}

def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 5;
  let NumMicroOps = 1;
}

def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 5;
  let NumMicroOps = 2;
}

def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
                                             Ampere1UnitS, Ampere1UnitS,
                                             Ampere1UnitZ, Ampere1UnitZ,
                                             Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 5;
  let NumMicroOps = 8;
}

def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 5;
  let NumMicroOps = 6;
}

// -- 6-cycle writes
def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 6;
  let NumMicroOps = 6;
}

def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitS,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitZ, Ampere1UnitZ,
                                                 Ampere1UnitZ]> {
  let Latency = 6;
  let NumMicroOps = 9;
}

def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
  let Latency = 6;
  let NumMicroOps = 2;
}

def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 6;
  let NumMicroOps = 1;
}

def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 6;
  let NumMicroOps = 2;
}

def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                           Ampere1UnitXY]> {
  let Latency = 6;
  let NumMicroOps = 3;
}

def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                          Ampere1UnitL]> {
  let Latency = 6;
  let NumMicroOps = 3;
}

def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                          Ampere1UnitL, Ampere1UnitL]> {
  let Latency = 6;
  let NumMicroOps = 4;
}

def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
  let Latency = 6;
  let NumMicroOps = 2;
}

// -- 7-cycle writes
def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
  let Latency = 7;
  let NumMicroOps = 1;
}

def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> {
  let Latency = 7;
  let NumMicroOps = 2;
}

def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> {
  let Latency = 7;
  let NumMicroOps = 2;
}

def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 7;
  let NumMicroOps = 4;
}

def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 7;
  let NumMicroOps = 2;
}

def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitZ, Ampere1UnitZ,
                                                 Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 7;
  let NumMicroOps = 12;
}

// -- 8-cycle writes
def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> {
  let Latency = 8;
  let NumMicroOps = 2;
}

def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA,
                                              Ampere1UnitA]> {
  let Latency = 8;
  let NumMicroOps = 3;
}

def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 8;
  let NumMicroOps = 2;
}

def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                           Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 8;
  let NumMicroOps = 4;
}

def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitL, Ampere1UnitXY,
                                              Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 8;
  let NumMicroOps = 6;
}

def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitXY, Ampere1UnitXY,
                                              Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 8;
  let NumMicroOps = 8;
}

// -- 9-cycle writes
def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitL, Ampere1UnitXY,
                                              Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 9;
  let NumMicroOps = 6;
}

def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitXY, Ampere1UnitXY,
                                              Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 9;
  let NumMicroOps = 8;
}

def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                           Ampere1UnitXY]> {
  let Latency = 9;
  let NumMicroOps = 3;
}

def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                              Ampere1UnitXY, Ampere1UnitXY,
                                              Ampere1UnitXY]> {
  let Latency = 9;
  let NumMicroOps = 5;
}

def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitZ, Ampere1UnitZ,
                                                 Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 9;
  let NumMicroOps = 14;
}

def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitXY, Ampere1UnitXY,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitS, Ampere1UnitS,
                                                 Ampere1UnitZ, Ampere1UnitZ,
                                                 Ampere1UnitZ, Ampere1UnitZ]> {
  let Latency = 9;
  let NumMicroOps = 16;
}

// -- 10-cycle writes
def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 10;
  let NumMicroOps = 2;
}

def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
  let Latency = 10;
  let NumMicroOps = 2;
}

def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> {
  let Latency = 10;
  let NumMicroOps = 2;
}

def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                               Ampere1UnitL, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 10;
  let NumMicroOps = 6;
}

def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS,
                                                  Ampere1UnitX]> {
  let Latency = 10;
  let NumMicroOps = 3;
}

def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS,
                                                   Ampere1UnitXY]> {
  let Latency = 10;
  let NumMicroOps = 3;
}

// -- 11-cycle writes
def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> {
  let Latency = 11;
  let NumMicroOps = 2;
}

def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS,
                                                  Ampere1UnitX]> {
  let Latency = 11;
  let NumMicroOps = 3;
}

def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS,
                                                   Ampere1UnitXY]> {
  let Latency = 11;
  let NumMicroOps = 3;
}

def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                               Ampere1UnitL, Ampere1UnitL,
                                               Ampere1UnitXY, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 11;
  let NumMicroOps = 12;
}

// -- 12-cycle writes
def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                                               Ampere1UnitL, Ampere1UnitL,
                                               Ampere1UnitXY, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY,
                                               Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 12;
  let NumMicroOps = 12;
}

def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                            Ampere1UnitXY]> {
  let Latency = 12;
  let NumMicroOps = 3;
}

def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                                            Ampere1UnitXY, Ampere1UnitXY]> {
  let Latency = 12;
  let NumMicroOps = 4;
}

// -- long-latency single-uop writes (divide, square root)
def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
  let Latency = 18;
  let NumMicroOps = 1;
}

def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 19;
  let NumMicroOps = 1;
}

def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 25;
  let NumMicroOps = 1;
}

def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 32;
  let NumMicroOps = 1;
}

def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
  let Latency = 34;
  let NumMicroOps = 1;
}

def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 34;
  let NumMicroOps = 1;
}

def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 39;
  let NumMicroOps = 1;
}

def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
  let Latency = 62;
  let NumMicroOps = 1;
}

// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
// which are a single uop, and for extended registers, which have full flexibility
// across Unit A or B for both uops.
//
// NOTE(review): each SchedVar below should carry template arguments (a
// predicate and a write list), but they are absent in this copy of the file;
// restore them from the authoritative source before building.
def Ampere1Write_Arith : SchedWriteVariant<[
                                SchedVar,
                                SchedVar,
                                SchedVar]>;

def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[
                                SchedVar,
                                SchedVar,
                                SchedVar]>;

//===----------------------------------------------------------------------===//
// Map the target-defined scheduler read/write resources and latencies for Ampere-1.
// This provides a coarse model, which is then specialised below.

// Coarse per-SchedWrite defaults.  Entries without an explicit body use the
// WriteRes class defaults; the braces override Latency and/or NumMicroOps.
//
// NOTE(review): every `WriteRes` below appears to have lost its template
// arguments (the SchedWrite being mapped and its resource list) in this copy
// of the file.  The trailing comments are the only remaining hint of which
// SchedWrite each line maps; restore the arguments from the authoritative
// source before building.
def : WriteRes; // MOVN, MOVZ
def : WriteRes; // ALU
def : WriteRes { let Latency = 2; let NumMicroOps = 2; } // ALU of Shifted-Reg
def : WriteRes { let Latency = 2; let NumMicroOps = 2; } // ALU of Extended-Reg
def : WriteRes; // EXTR shifts a reg pair
def : WriteRes; // Shift/Scale
def : WriteRes { let Latency = 18; } // 32-bit Divide
def : WriteRes { let Latency = 34; } // 64-bit Divide
def : WriteRes { let Latency = 3; } // 32-bit Multiply
def : WriteRes { let Latency = 3; } // 32-bit Multiply (NOTE(review): presumably the 64-bit multiply entry -- confirm)
def : WriteRes;
def : WriteRes;
def : WriteRes { let Latency = 4; } // Load from base addr plus immediate offset
def : WriteRes { let Latency = 1; } // Store to base addr plus immediate offset
def : WriteRes { let Latency = 1; let NumMicroOps = 2; } // Store a register pair.
def : WriteRes;
def : WriteRes { let Latency = 5; let NumMicroOps = 2; } // Load from a register index (maybe scaled).
def : WriteRes { let Latency = 1; let NumMicroOps = 2; } // Store to a register index (maybe scaled).
def : WriteRes { let Latency = 2; } // General floating-point ops.
def : WriteRes { let Latency = 5; } // Floating-point compare.
def : WriteRes { let Latency = 6; } // Float conversion.
def : WriteRes { } // Float-int register copy.
def : WriteRes { let Latency = 2; } // Float-int register copy. (NOTE(review): duplicate label; presumably a different SchedWrite -- confirm)
def : WriteRes { let Latency = 5; } // Floating-point multiply.
def : WriteRes { let Latency = 34; } // Floating-point division.
def : WriteRes { let Latency = 3; } // 64bit Vector D ops.
def : WriteRes { let Latency = 3; } // 128bit Vector Q ops.
def : WriteRes { let Latency = 5; } // Vector loads.
def : WriteRes { let Latency = 2; } // Vector stores.

// Marked unsupported for this model.
def : WriteRes { let Unsupported = 1; }

def : WriteRes { let Latency = 1; }
def : WriteRes { let Latency = 1; }
def : WriteRes { let Latency = 1; }

def : WriteRes { let Latency = 4; } // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP

// Forwarding logic.
// NOTE(review): each ReadAdvance below should carry template arguments (the
// SchedRead and its advance in cycles), but they are absent in this copy of
// the file; restore them from the authoritative source before building.
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;
def : ReadAdvance;

//===----------------------------------------------------------------------===//
// Specialising the scheduling model further for Ampere-1.
//
// Each InstRW below overrides the coarse WriteRes mapping for the listed
// instructions (`instrs`: exact opcode names, `instregex`: name patterns).

def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;

// Branch instructions
def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>;
def : InstRW<[Ampere1Write_1cyc_1A],
        (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;

// Cryptography instructions
// -- AES encryption/decryption
def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>;
def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>;
// -- Polynomial multiplication
def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
// -- SHA-256 hash
def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
// -- SHA-256 schedule update
def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;
// -- SHA-3 instructions
def : InstRW<[Ampere1Write_2cyc_1XY],
        (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
// -- SHA-512 hash
def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
// -- SHA-512 schedule update
def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;
// -- SHA1 choose/majority/parity
def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
// -- SHA1 hash/schedule update
def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>;
def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>;

// FP and vector load instructions
// -- Load 1-element structure to one/all lanes
// ---- all lanes
def : InstRW<[Ampere1Write_7cyc_1L_1XY],
        (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// ---- one lane
def : InstRW<[Ampere1Write_7cyc_1L_1XY],
        (instregex "^LD1i(8|16|32|64)")>;
// -- Load 1-element structure to one/all lanes, 1D size
def : InstRW<[Ampere1Write_5cyc_1L],
        (instregex "^LD1Rv1d")>;
// -- Load 1-element structures to 1 register
def : InstRW<[Ampere1Write_5cyc_1L],
        (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 1-element structures to 2 registers
def : InstRW<[Ampere1Write_5cyc_2L],
        (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 1-element structures to 3 registers
def : InstRW<[Ampere1Write_6cyc_3L],
        (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 1-element structures to 4 registers
def : InstRW<[Ampere1Write_6cyc_4L],
        (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Load 2-element structure to all lanes of 2 registers, 1D size
def : InstRW<[Ampere1Write_5cyc_2L],
        (instregex "^LD2Rv1d")>;
// -- Load 2-element structure to all lanes of 2 registers, other sizes
def : InstRW<[Ampere1Write_7cyc_2L_2XY],
        (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- Load 2-element structure to one lane of 2 registers
def : InstRW<[Ampere1Write_7cyc_2L_2XY],
        (instregex "^LD2i(8|16|32|64)")>;
// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
def : InstRW<[Ampere1Write_7cyc_2L_2XY],
        (instregex "^LD2Twov(16b|8h|4s|2d)")>;
// -- Load 2-element structures to 2 registers, 8B/4H/2S size
def : InstRW<[Ampere1Write_9cyc_2L_3XY],
        (instregex "^LD2Twov(8b|4h|2s)")>;
// -- Load 3-element structure to all lanes of 3 registers, 1D size
def : InstRW<[Ampere1Write_6cyc_3L],
        (instregex "^LD3Rv1d")>;
// -- Load 3-element structure to all lanes of 3 registers, other sizes
def : InstRW<[Ampere1Write_8cyc_3L_3XY],
        (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- Load 3-element structure to one lane of 3 registers
def : InstRW<[Ampere1Write_8cyc_3L_3XY],
        (instregex "^LD3i(8|16|32|64)")>;
// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
def : InstRW<[Ampere1Write_9cyc_3L_3XY],
        (instregex "^LD3Threev(16b|8h|4s)")>;
// -- Load 3-element structures to 3 registers, 2D size
def : InstRW<[Ampere1Write_8cyc_3L_3XY],
        (instregex "^LD3Threev2d")>;
// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
def : InstRW<[Ampere1Write_10cyc_3L_3XY],
        (instregex "^LD3Threev(8b|4h|2s)")>;
// -- Load 4-element structure to all lanes of 4 registers, 1D size
def : InstRW<[Ampere1Write_6cyc_4L],
        (instregex "^LD4Rv1d")>;
// -- Load 4-element structure to all lanes of 4 registers, other sizes
def : InstRW<[Ampere1Write_8cyc_4L_4XY],
        (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
// -- Load 4-element structure to one lane of 4 registers
def : InstRW<[Ampere1Write_6cyc_4L],
        (instregex "^LD4i(8|16|32|64)")>;
// -- Load 4-element structures to 4 registers, 2D size
def : InstRW<[Ampere1Write_9cyc_4L_4XY],
        (instregex "^LD4Fourv2d")>;
// -- Load 4-element structures to 4 registers, 2S size
def : InstRW<[Ampere1Write_12cyc_4L_8XY],
        (instregex "^LD4Fourv2s")>;
// -- Load 4-element structures to 4 registers, other sizes
def : InstRW<[Ampere1Write_11cyc_4L_8XY],
        (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
// -- Load pair, Q-form
def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>;
// -- Load pair, S/D-form
def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>;
// -- Load register
def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>;
// -- Load register, sign-extended register
def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>;

// FP and vector store instructions
// -- Store 1-element structure from one lane of 1 register
def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z],
        (instregex "^ST1i(8|16|32|64)")>;
// -- Store 1-element structures from 1 register
def : InstRW<[Ampere1Write_2cyc_1S_1Z],
        (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 1-element structures from 2 registers
def : InstRW<[Ampere1Write_3cyc_2S_2Z],
        (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 1-element structures from 3 registers
def : InstRW<[Ampere1Write_4cyc_3S_3Z],
        (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 1-element structures from 4 registers
def : InstRW<[Ampere1Write_5cyc_4S_4Z],
        (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 2-element structure from one lane of 2 registers
def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
        (instregex "^ST2i(8|16|32|64)")>;
// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
        (instregex "^ST2Twov(16b|8h|4s|2d)")>;
// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z],
        (instregex "^ST2Twov(8b|4h|2s)")>;
// -- Store 3-element structure from one lane of 3 registers
def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
        (instregex "^ST3i(8|16|32|64)")>;
// -- Store 3-element structures from 3 registers
def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
        (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
// -- Store 4-element structure from one lane of 4 registers
def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
        (instregex "^ST4i(8|16|32|64)")>;
// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z],
        (instregex "^ST4Fourv(16b|8h|4s)")>;
// -- Store 4-element structures from 4 registers, 2D sizes
def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
        (instregex "^ST4Fourv2d")>;
// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z],
        (instregex "^ST4Fourv(8b|4h|2s)")>;
// -- Store pair, Q-form
def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>;
// -- Store pair, S/D-form
def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>;
// -- Store register
def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>;
// -- Store register, sign-extended register offset
def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;

// FP data processing, bfloat16 format
def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>;
def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>;
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>;

// FP data processing, scalar/vector, half precision
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY],
        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
def : InstRW<[Ampere1Write_4cyc_1X],
        (instregex "^FCMPE?H")>;
def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X],
        (instregex "^FCCMPE?H")>;
def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY],
        (instregex "^FCSELH")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>;
def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>;

// FP data processing, scalar/vector, single/double precision
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
def : InstRW<[Ampere1Write_5cyc_1XY],
        (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
def : InstRW<[Ampere1Write_5cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
def : InstRW<[Ampere1Write_5cyc_1XY],
        (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
def : InstRW<[Ampere1Write_5cyc_1X],
        (instregex "^FCMPE?(S|D)")>;
def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X],
        (instregex "^FCCMPE?(S|D)")>;
def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY],
        (instregex "^FCSEL(S|D)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>;
def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>;
def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>;
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>;
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>;
def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>;

// FP miscellaneous instructions
def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>;
def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>;
def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>;
def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>;
def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;

// Integer arithmetic and logical instructions
def : InstRW<[Ampere1Write_1cyc_1A],
        (instregex "ADC(W|X)r", "SBC(W|X)r")>;
def : InstRW<[Ampere1Write_Arith],
        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r[sx]")>;
def : InstRW<[Ampere1Write_1cyc_1AB],
        (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r[ri]")>;
def : InstRW<[Ampere1Write_ArithFlagsetting],
        (instregex "(ADD|AND|BIC|SUB)S(W|X)r[sx]")>;
def : InstRW<[Ampere1Write_1cyc_1A],
        (instregex "(ADD|AND|BIC|SUB)S(W|X)r[ri]")>;
def : InstRW<[Ampere1Write_1cyc_1A],
        (instregex "(ADC|SBC)S(W|X)r")>;
def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>;
def : InstRW<[Ampere1Write_1cyc_1A],
        (instregex "(CCMN|CCMP)(X|W)")>;
def : InstRW<[Ampere1Write_1cyc_1A],
        (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>;
def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>;
def : InstRW<[Ampere1Write_3cyc_1BS],
        (instregex "(S|U)MULHr")>;
def : InstRW<[Ampere1Write_4cyc_1BS],
        (instregex "(S|U)?M(ADD|SUB)L?r")>;

// Integer load instructions
def : InstRW<[Ampere1Write_4cyc_2L],
        (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
def : InstRW<[Ampere1Write_4cyc_1L],
        (instregex "LDR(B|D|H|Q|S)ui")>;
def : InstRW<[Ampere1Write_4cyc_1L],
        (instregex "LDR(D|Q|W|X)l")>;
def : InstRW<[Ampere1Write_4cyc_1L],
        (instregex "LDTR(B|H|W|X)i")>;
def : InstRW<[Ampere1Write_4cyc_1L],
        (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
def : InstRW<[Ampere1Write_4cyc_1L],
        (instregex "LDUR(BB|HH|X|W)i")>;
def : InstRW<[Ampere1Write_4cyc_1L],
        (instregex "LDURS(BW|BX|HW|HX|W)i")>;
def : InstRW<[Ampere1Write_5cyc_1AB_1L],
        (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
// Prefetches with literal, unsigned-immediate and unscaled-immediate
// addressing; the register-offset forms follow on the next entry.
def : InstRW<[Ampere1Write_1cyc_1L],
        (instrs PRFMl, PRFMui, PRFUMi)>;
def : InstRW<[Ampere1Write_2cyc_1AB_1L],
        (instrs PRFMroW, PRFMroX)>;

// Integer miscellaneous instructions
def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>;
def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>;
def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>;
def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>;
def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>;
def : InstRW<[Ampere1Write_1cyc_1AB],
        (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
def : InstRW<[Ampere1Write_1cyc_1B],
        (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
def : InstRW<[Ampere1Write_1cyc_1B],
        (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;

// Integer store instructions
def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>;
def : InstRW<[Ampere1Write_2cyc_1B_1S],
        (instrs STPWi, STPXi)>;
def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB],
        (instregex "STP(W|X)(pre|post)")>;
def : InstRW<[Ampere1Write_1cyc_1S],
        (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
def : InstRW<[Ampere1Write_1cyc_1S],
        (instregex "STUR(BB|HH|X|W)i",
                   "STR(X|W)ui")>;
def : InstRW<[Ampere1Write_1cyc_2S],
        (instrs STRWroX, STRXroX)>;
def : InstRW<[Ampere1Write_2cyc_1AB_2S],
        (instrs STRWroW, STRXroW)>;

// Pointer authentication
//def : InstRW<[Ampere1Write_7cyc_1BS],
//        (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
def : InstRW<[Ampere1Write_8cyc_1BS_1A],
        (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
def : InstRW<[Ampere1Write_8cyc_1BS_2A],
        (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
//def : InstRW<[Ampere1Write_7cyc_1BS],
//        (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>;

// Vector integer instructions
// -- absolute difference
def : InstRW<[Ampere1Write_3cyc_1XY],
        (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
                   "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
// -- arithmetic
def : InstRW<[Ampere1Write_3cyc_1XY],
        (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
                   "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
                   "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
// -- arithmetic, horizontal, 16B
def : InstRW<[Ampere1Write_12cyc_4XY],
        (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
def : InstRW<[Ampere1Write_12cyc_4XY],
        (instregex "^[SU](MIN|MAX)Vv16i8v")>;
// -- arithmetic, horizontal, 4H/4S
// NOTE(review): the add and min/max patterns in this group and the next do
// not select the same operand sizes as each other or as the headings
// (e.g. v8i8 adds are here, v8i8 min/max are under 8B/8H) -- confirm intent.
def : InstRW<[Ampere1Write_6cyc_2XY],
        (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
def : InstRW<[Ampere1Write_6cyc_2XY],
        (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
// -- arithmetic, horizontal, 8B/8H
def : InstRW<[Ampere1Write_9cyc_3XY],
        (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
def : InstRW<[Ampere1Write_9cyc_3XY],
        (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
// -- arithmetic, narrowing
def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
// -- arithmetic, pairwise
def : InstRW<[Ampere1Write_3cyc_1XY],
        (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
// -- arithmetic, saturating
def : InstRW<[Ampere1Write_3cyc_1XY],
        (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
// -- bit count
def : InstRW<[Ampere1Write_2cyc_1XY],
        (instregex "^(CLS|CLZ|CNT)v")>;
// -- compare
def : InstRW<[Ampere1Write_3cyc_1XY],
        (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
                   "^CMHIv", "^CMHSv")>;
// -- compare non-zero
def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>;
// -- dot product
def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
// -- fp reciprocal estimate
def 
: InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; // -- integer reciprocal estimate def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; // -- logical def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; // -- logical, narrowing def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "RSHRNv", "SHRNv", "SQSHRNv", "SQSHRUNv", "UQXTNv")>; // -- matrix multiply def : InstRW<[Ampere1Write_6cyc_2XY], (instrs SMMLA, UMMLA, USMMLA)>; // -- max/min def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; // -- move immediate def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; // -- multiply def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; // -- multiply accumulate def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; // -- negation, saturating def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; // -- reverse bits/bytes def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; // -- shift def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; // -- shift and accumulate def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; // -- shift, saturating def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", "^UQSHL")>; // Vector miscellaneous instructions // -- duplicate element def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; // -- duplicate from GPR def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; // -- extract narrow def : InstRW<[Ampere1Write_2cyc_1XY], (instregex 
"^XTNv")>; // -- insert/extract element def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; // -- move FP immediate def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; // -- move element to GPR def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; // -- move from GPR to any element def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; // -- table lookup def : InstRW<[Ampere1Write_2cyc_1XY], (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; def : InstRW<[Ampere1Write_4cyc_2XY], (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; def : InstRW<[Ampere1Write_6cyc_3XY], (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; def : InstRW<[Ampere1Write_8cyc_4XY], (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; // -- transpose def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; // -- zip/unzip def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; } // SchedModel = Ampere1Model