Lines Matching +full:quad +full:- +full:precision
1 //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 //===----------------------------------------------------------------------===//
13 // ===---------------------------------------------------------------------===//
18 // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
41 // Two fully-pipelined integer ALU pipelines
202 // FIXME: If address is 64-bit aligned, AGU cycles is 1.
287 -1>, // dynamic uops
296 -1>, // dynamic uops
306 -1>, // dynamic uops
315 -1>, // dynamic uops
325 -1>, // dynamic uops
346 // FIXME: If address is 64-bit aligned, AGU cycles is 1.
420 [], [], -1>, // dynamic uops
427 [2], [], -1>, // dynamic uops
441 // instruction and vice-versa. We model this behavior with two artificial FUs:
445 // - Acquires DRegsVFP resource for 1 cycle
446 // - Reserves DRegsN resource for the whole duration (including time to
451 // "cross-domain" stalls.
464 // Single-precision FP Unary
473 // Double-precision FP Unary
483 // Single-precision FP Compare
492 // Double-precision FP Compare
535 // Single-Precision FP to Integer Convert
543 // Double-Precision FP to Integer Convert
551 // Integer to Single-Precision FP Convert
559 // Integer to Double-Precision FP Convert
567 // Single-precision FP ALU
575 // Double-precision FP ALU
583 // Single-precision FP Multiply
591 // Double-precision FP Multiply
599 // Single-precision FP MAC
607 // Double-precision FP MAC
615 // Single-precision Fused FP MAC
623 // Double-precision Fused FP MAC
631 // Single-precision FP DIV
639 // Double-precision FP DIV
647 // Single-precision FP SQRT
655 // Double-precision FP SQRT
664 // Integer to Single-precision Move
673 // Integer to Double-precision Move
682 // Single-precision to Integer Move
684 // On A9 move-from-VFP is free to issue with no stall if other VFP
685 // operations are in flight. I assume it still can't dual-issue though.
690 // Double-precision to Integer Move
692 // On A9 move-from-VFP is free to issue with no stall if other VFP
693 // operations are in flight. I assume it still can't dual-issue though.
698 // Single-precision FP Load
707 // Double-precision FP Load
708 // FIXME: Result latency is 1 if address is 64-bit aligned.
725 [1, 1, 1, 1], [], -1>, // dynamic uops
735 [2, 1, 1, 1], [], -1>, // dynamic uops
737 // Single-precision FP Store
746 // Double-precision FP Store
763 [1, 1, 1, 1], [], -1>, // dynamic uops
773 [2, 1, 1, 1], [], -1>, // dynamic uops
1275 // Double-register Integer Unary
1284 // Quad-register Integer Unary
1293 // Double-register Integer Q-Unary
1302 // Quad-register Integer Q-Unary
1311 // Double-register Integer Binary
1320 // Quad-register Integer Binary
1329 // Double-register Integer Subtract
1338 // Quad-register Integer Subtract
1347 // Double-register Integer Shift
1356 // Quad-register Integer Shift
1365 // Double-register Integer Shift (4 cycle)
1374 // Quad-register Integer Shift (4 cycle)
1383 // Double-register Integer Binary (4 cycle)
1392 // Quad-register Integer Binary (4 cycle)
1401 // Double-register Integer Subtract (4 cycle)
1410 // Quad-register Integer Subtract (4 cycle)
1420 // Double-register Integer Count
1429 // Quad-register Integer Count
1440 // Double-register Absolute Difference and Accumulate
1449 // Quad-register Absolute Difference and Accumulate
1458 // Double-register Integer Pair Add Long
1467 // Quad-register Integer Pair Add Long
1477 // Double-register Integer Multiply (.8, .16)
1486 // Quad-register Integer Multiply (.8, .16)
1496 // Double-register Integer Multiply (.32)
1505 // Quad-register Integer Multiply (.32)
1514 // Double-register Integer Multiply-Accumulate (.8, .16)
1523 // Double-register Integer Multiply-Accumulate (.32)
1532 // Quad-register Integer Multiply-Accumulate (.8, .16)
1541 // Quad-register Integer Multiply-Accumulate (.32)
1568 // Double-register Permute Move
1577 // Quad-register Permute Move
1586 // Integer to Single-precision Move
1594 // Integer to Double-precision Move
1602 // Single-precision to Integer Move
1610 // Double-precision to Integer Move
1636 // Double-register FP Unary
1645 // Quad-register FP Unary
1656 // Double-register FP Binary
1677 // Double-register FP VMUL
1686 // Quad-register FP Binary
1699 // Quad-register FP VMUL
1708 // Double-register FP Multiply-Accumulate
1717 // Quad-register FP Multiply-Accumulate
1728 // Double-register Fused FP Multiply-Accumulate
1737 // Quad-register Fused FP Multiply-Accumulate
1748 // Double-register Reciprocal Step
1757 // Quad-register Reciprocal Step
1766 // Double-register Permute
1775 // Quad-register Permute
1786 // Quad-register Permute (3 cycle issue)
1798 // Double-register VEXT
1807 // Quad-register VEXT
1877 // ===---------------------------------------------------------------------===//
1878 // The following definitions describe the simpler per-operand machine model.
1886 // Cortex-A9 machine model for scheduling and other instruction cost heuristics.
1888 let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
1905 //===----------------------------------------------------------------------===//
1923 //===----------------------------------------------------------------------===//
1932 // Write an integer shifted-by register
1960 // Floating-point
2018 // Load the upper 32-bits using the same micro-op.
2033 //===----------------------------------------------------------------------===//
2038 foreach NumCycles = 2-8 in {
2043 foreach NumAddr = 1-8 in {
2045 // Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
2051 SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
2055 // Fall-back for unknown LDMs.
2056 def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
2069 // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
2084 foreach NumAddr = 1-8 in {
2091 //===----------------------------------------------------------------------===//
2092 // LDM: Load multiple into 32-bit integer registers.
2104 // A9WriteLM variants expand into a pair of writes for each 64-bit
2110 SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
2111 SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
2112 SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
2113 SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
2114 SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
2115 SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
2116 SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
2117 SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
2131 //===----------------------------------------------------------------------===//
2135 // so can be used in WriteSequences for single-issue instructions that
2142 foreach NumAddr = 1-8 in {
2144 // Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
2147 // A9WriteLfp1-8 definitions are statically expanded into a sequence of
2153 // A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
2162 // A9WriteLfp1-8 sequence based on a predicate. This supports the
2163 // preRA VLDM variants in which all 64-bit loads are written to the
2164 // same tuple of either single or double precision registers.
2181 foreach NumAddr = 1-8 in {
2196 // pair of writes for each 64-bit data loaded. When the number of
2201 [A9WriteLMfp1, A9WriteLMfp2, // 0-1
2202 A9WriteLMfp3, A9WriteLMfp4, // 2-3
2203 A9WriteLMfp5, A9WriteLMfp6, // 4-5
2204 A9WriteLMfp7, A9WriteLMfp8, // 6-7
2205 A9WriteLMfp1Hi, // 8-8
2206 A9WriteLMfp2Hi, A9WriteLMfp2Hi, // 9-10
2207 A9WriteLMfp3Hi, A9WriteLMfp3Hi, // 11-12
2208 A9WriteLMfp4Hi, A9WriteLMfp4Hi, // 13-14
2209 A9WriteLMfp5Hi, A9WriteLMfp5Hi, // 15-16
2210 A9WriteLMfp6Hi, A9WriteLMfp6Hi, // 17-18
2211 A9WriteLMfp7Hi, A9WriteLMfp7Hi, // 19-20
2212 A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
2215 SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
2216 SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
2217 SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
2218 SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
2219 SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
2220 SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
2221 SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
2222 SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
2239 // Distinguish between our multiple MI-level forms of the same
2242 "MI->getOperand(0).getReg().isVirtual()">;
2244 "MI->getOperand(0).getReg().isPhysical()">;
2252 //===----------------------------------------------------------------------===//
2253 // Resources for other (non-LDM/VLDM) Variants.
2270 // after the instruction issues, decreases producer latency by N-1.
2275 //===----------------------------------------------------------------------===//
2281 // This table follows the ARM Cortex-A9 Technical Reference Manuals,
2309 // TODO: For floating-point ops, we model the pipeline forwarding
2367 // Reuse the load-multiple variants for store-multiple because the
2481 // NEON floating-point
2499 // ===---------------------------------------------------------------------===//
2500 // Floating-point. Map target defined SchedReadWrite to processor specific ones
2522 // ===---------------------------------------------------------------------===//
2523 // Subtarget-specific overrides. Map opcodes to list of SchedReadWrite types.