Lines Matching +full:pre +full:- +full:multiply
1 //=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the machine model for ARM Cortex-A57 to support
12 //===----------------------------------------------------------------------===//
14 //===----------------------------------------------------------------------===//
16 // The Cortex-A57 is a traditional superscalar microprocessor with a
17 // conservative 3-wide in-order stage for decode and dispatch. Combined with the
18 // much wider out-of-order issue stage, this produced a need to carefully
19 // schedule micro-ops so that all three decoded each cycle are successfully
22 // modeling the machine as out-of-order.
74 let IssueWidth = 3; // 3-way decode and dispatch
75 let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
90 //===----------------------------------------------------------------------===//
91 // Define each kind of processor resource and number available on Cortex-A57.
92 // Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
93 // micro-ops wait for their operands and then issue out-of-order.
95 def A57UnitB : ProcResource<1>; // Type B micro-ops
96 def A57UnitI : ProcResource<2>; // Type I micro-ops
97 def A57UnitM : ProcResource<1>; // Type M micro-ops
98 def A57UnitL : ProcResource<1>; // Type L micro-ops
99 def A57UnitS : ProcResource<1>; // Type S micro-ops
101 def A57UnitX : ProcResource<1>; // Type X micro-ops (F1)
102 def A57UnitW : ProcResource<1>; // Type W micro-ops (F0)
105 def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
110 //===----------------------------------------------------------------------===//
111 // Define customized scheduler read/write types specific to the Cortex-A57.
151 // -----------------------------------------------------------------------------
155 // --- 3.2 Branch Instructions ---
169 // --- 3.3 Arithmetic and Logical Instructions ---
210 // --- 3.4 Move and Shift Instructions ---
240 // MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
257 // --- 3.5 Divide and Multiply Instructions ---
261 // Multiply: tMul not bound to common WriteRes types
267 // Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
269 // Multiply-accumulate pipelines support late-forwarding of accumulate operands
270 // from similar μops, allowing a typical sequence of multiply-accumulate μops
290 // Multiply long: SMULL, UMULL
294 // --- 3.6 Saturating and Parallel Arithmetic Instructions ---
297 // Conditional GE-setting instructions require three extra μops
346 // --- 3.7 Miscellaneous Data-Processing Instructions ---
374 // --- 3.8 Load Instructions ---
418 // --- LDR pre-indexed ---
419 // Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
423 // Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
424 // (5 cyc load result for not-lsl2 scaled)
441 // LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
455 // --- LDR post-indexed ---
489 // LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
495 // --- Preload instructions ---
510 // --- Load multiple instructions ---
511 foreach NumAddr = 1-8 in {
530 SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>,
531 SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>,
532 SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>,
533 SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>,
534 SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>,
535 SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>,
536 SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>,
537 SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>,
538 SchedVar<NoSchedPred, A57LDMOpsListNoregin.Writes[0-15]>
551 SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>,
552 SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>,
553 SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>,
554 SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>,
555 SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>,
556 SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>,
557 SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>,
558 SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>,
559 SchedVar<NoSchedPred, A57LDMOpsListRegin.Writes[0-15]>
573 SchedVar<A57LMAddrUpdPred1, A57LDMOpsList_Upd.Writes[0-2]>,
574 SchedVar<A57LMAddrUpdPred2, A57LDMOpsList_Upd.Writes[0-4]>,
575 SchedVar<A57LMAddrUpdPred3, A57LDMOpsList_Upd.Writes[0-6]>,
576 SchedVar<A57LMAddrUpdPred4, A57LDMOpsList_Upd.Writes[0-8]>,
577 SchedVar<A57LMAddrUpdPred5, A57LDMOpsList_Upd.Writes[0-10]>,
578 SchedVar<A57LMAddrUpdPred6, A57LDMOpsList_Upd.Writes[0-12]>,
579 SchedVar<A57LMAddrUpdPred7, A57LDMOpsList_Upd.Writes[0-14]>,
580 SchedVar<A57LMAddrUpdPred8, A57LDMOpsList_Upd.Writes[0-16]>,
581 SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]>
597 // --- 3.9 Store Instructions ---
625 // Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
627 "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
628 "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
630 // Store, register pre-indexed:
649 // pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
677 // 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
681 // post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
686 // --- Store multiple instructions ---
717 // --- 3.10 FP Data Processing Instructions ---
723 // fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
753 // FP multiply-accumulate pipelines support late forwarding of the result
754 // from FP multiply μops to the accumulate operands of an
755 // FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
756 // after the FP multiply μop has been issued
757 // FP multiply, FZ
764 // FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
768 // VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.)
769 // VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.)
772 // Zero latency (instead of one) for VMUL->VFMA shouldn't break anything.
793 // --- 3.11 FP Miscellaneous Instructions ---
806 // 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
809 // --- 3.12 FP Load Instructions ---
826 SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>,
827 SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>,
828 SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>,
829 SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>,
830 SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
831 SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
832 SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
833 SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>
846 SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>,
847 SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>,
848 SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>,
849 SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>,
850 SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
851 SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
852 SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
853 SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>
873 SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>,
874 SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>,
875 SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>,
876 SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>,
877 SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
878 SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
879 SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
880 SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>
893 SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>,
894 SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>,
895 SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>,
896 SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>,
897 SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
898 SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
899 SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
900 SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>
911 // --- 3.13 FP Store Instructions ---
968 // --- 3.14 ASIMD Integer Instructions ---
973 // ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
1013 // ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1014 // Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
1015 // and multiply-with-accumulate instructions relative to r0pX.
1023 // ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
1031 // ASIMD multiply accumulate, D-form
1044 // ASIMD multiply accumulate, Q-form
1057 // ASIMD multiply accumulate long
1070 // ASIMD multiply accumulate saturating long
1083 // Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long
1088 // ASIMD multiply long
1117 // ASIMD shift by immed and insert, basic, D-form
1121 // ASIMD shift by immed and insert, basic, Q-form
1125 // ASIMD shift by register, basic, D-form
1129 // ASIMD shift by register, basic, Q-form
1133 // ASIMD shift by register, complex, D-form
1139 // ASIMD shift by register, complex, Q-form
1144 // --- 3.15 ASIMD Floating-Point Instructions ---
1164 // ASIMD FP convert, half-precision: 8cyc F0/F1
1175 // ASIMD FP multiply
1179 // ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
1193 // --- 3.16 ASIMD Miscellaneous Instructions ---
1227 // ASIMD reverse, swap, table lookup (1-2 reg)
1230 // ASIMD table lookup (3-4 reg)
1242 // ASIMD unzip/zip, D-form
1246 // ASIMD unzip/zip, Q-form
1250 // --- 3.17 ASIMD Load Instructions ---
1262 // 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
1267 // 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
1396 // --- 3.18 ASIMD Store Instructions ---
1462 // --- 3.19 Cryptography Extensions ---
1466 // Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
1479 // --- 3.20 CRC ---
1482 // -----------------------------------------------------------------------------