Lines Matching +full:- +full:19 +full:v

1 //=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 // * AMD Software Optimization Guide for AMD Family 19h Processors.
14 //===----------------------------------------------------------------------===//
17 // AMD SOG 19h, 2.9.6 Dispatch
21 // AMD SOG 19h, 2.10.3
23 // outstanding operations (integer, load/store, and floating-point) and is
26 // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
28 // AMD SOG 19h, 2.9.1 Op Cache
30 // At each set-way intersection is an entry containing up to 8 macro ops.
33 // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
34 // unrolling leading to excessive filling of the op-cache from frontend.
36 // AMD SOG 19h, 2.6.2 L1 Data Cache
37 // The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
38 // AMD SOG 19h, 2.12 L1 Data Cache
40 // <...> and can achieve 4-cycle load-to-use integer load latency.
42 // AMD SOG 19h, 2.12 L1 Data Cache
44 // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
50 // AMD SOG 19h, 2.8 Optimizing Branching
63 //===----------------------------------------------------------------------===//
65 //===----------------------------------------------------------------------===//
67 // AMD SOG 19h, 2.10.3 Retire Control Unit
69 // 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
70 // The retire unit handles in-order commit of up to nine macro ops per cycle.
73 //===----------------------------------------------------------------------===//
77 // AMD SOG 19h, 2.4 Superscalar Organization
83 //===----------------------------------------------------------------------===//
85 // AMD SOG 19h, 2.10.2 Execution Units
93 // AMD SOG 19h, 2.10.2 Execution Units
97 // AMD SOG 19h, 2.10.2 Execution Units
107 //===----------------------------------------------------------------------===//
109 // AMD SOG 19h, 2.10.2 Execution Units
113 // AMD SOG 19h, 2.10.2 Execution Units
121 //===----------------------------------------------------------------------===//
144 //===----------------------------------------------------------------------===//
146 // AMD SOG 19h, 2.10.3 Retire Control Unit
153 // AMD SOG 19h, 2.10.1 Schedulers
166 //===----------------------------------------------------------------------===//
167 // Floating-Point Unit
170 // AMD SOG 19h, 2.4 Superscalar Organization
172 // each servicing two FP pipelines and one store or FP-to-integer pipeline.
176 //===----------------------------------------------------------------------===//
178 // AMD SOG 19h, 2.10.1 Schedulers
190 //===----------------------------------------------------------------------===//
191 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
193 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
197 // (v)FADD*
206 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
253 //===----------------------------------------------------------------------===//
255 // AMD SOG 19h, 2.11 Floating-Point Unit
260 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
263 // (v)FADD*
284 // AMD SOG 19h, 2.11 Floating-Point Unit
289 // AMD SOG 19h, 2.11 Floating-Point Unit
328 //===----------------------------------------------------------------------===//
330 // Agner, 21.8 Register renaming and out-of-order schedulers
337 // AMD SOG 19h, 2.11 Floating-Point Unit
338 // The floating-point scheduler has a 2*32 entry macro op capacity.
339 // AMD SOG 19h, 2.11 Floating-Point Unit
348 // AMD SOG 19h, 2.11 Floating-Point Unit
350 // even if floating-point scheduler is full.
354 //===----------------------------------------------------------------------===//
355 // Load-Store Unit
358 // AMD SOG 19h, 2.12 Load-Store Unit
359 // The LS unit contains three largely independent pipelines
360 // enabling the execution of three 256-bit memory operations per cycle.
363 // AMD SOG 19h, 2.12 Load-Store Unit
367 // AMD SOG 19h, 2.12 Load-Store Unit
368 // The LS unit can process up to 72 out-of-order loads.
374 // AMD SOG 19h, 2.12 Load-Store Unit
378 // AMD SOG 19h, 2.12 Load-Store Unit
379 // The LS unit utilizes a 64-entry store queue (STQ).
385 //===----------------------------------------------------------------------===//
387 //===----------------------------------------------------------------------===//
390 // Instructions with folded loads are usually micro-fused, so they only appear
391 // as two micro-ops when dispatched by the schedulers.
484 //===----------------------------------------------------------------------===//
486 //===----------------------------------------------------------------------===//
494 // AMD SOG 19h, 2.11 Floating-Point Unit
497 def : ReadAdvance<ReadInt2Fpu, -1>;
506 // Model the effect of clobbering the read-write mask operand of the GATHER operation.
597 // A 3-operand LEA (base, index, offset).
615 let Latency = 2; // FIXME: not from llvm-exegesis
623 defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
624 defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-b…
625 defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplica…
626 defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplica…
627 defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplica…
628 defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned M…
629 defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplica…
630 defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplica…
631 defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplica…
632 defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned M…
633 defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplica…
634 defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplica…
638 defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swa…
639 defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swa…
660 let Latency = 3; // FIXME: not from llvm-exegesis
662 let NumMicroOps = 19;
667 let Latency = 4; // FIXME: not from llvm-exegesis
681 let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
688 let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
695 // FIXME: uops for 8-bit division measures as 2. for others it's a guess.
696 // FIXME: latency for 8-bit division measures as 10. for others it's a guess.
737 defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X8…
739 …CStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
846 defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
860 let Latency = 2; // FIXME: not from llvm-exegesis
881 let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
891 let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
913 defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exe…
914 defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-e…
939 defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-
955 defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-ex…
963 defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-e…
964 defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-
965 defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-
1140 …est, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector in…
1141 …estY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector i…
1142 …estZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector i…
1182 defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr t…
1193 defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer.
1194 defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
1195 defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
1196 defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).
1203 defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer.
1205 defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1206 defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
1207 defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
1209 defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> …
1210 defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1211 defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> …
1212 defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> …
1220 defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> …
1221 defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1222 defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1223 defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).
1231 defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conver…
1232 defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size convers…
1233 defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> D…
1234 defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> D…
1236 defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conver…
1237 defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size convers…
1238 defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conver…
1239 defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conver…
1241 defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversio…
1242 defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Fl…
1243 defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Fl…
1245 defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1246 defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (…
1247 defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (…
1249 …FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size c…
1250 …FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size c…
1251 …FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size c…
1341 // Carry-less multiplication instructions.
1345 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1348 …ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1349 …Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1351 // Catch-all for expensive system instructions.
1355 let Latency = 0; // FIXME: not from llvm-exegesis
1362 let Latency = 10; // FIXME: not from llvm-exegesis
1369 defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit…
1370 defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-
1371 defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffl…
1429 …WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-c…
1430 defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector v…
1456 defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1480 defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW su…
1548 "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
1549 "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
1559 "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
1581 "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1582 "(V?)P(SLL|SRL|SRA)DQYri",
1583 "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
1584 "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
1585 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1586 "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
1587 "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
1607 "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
1617 "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
1627 "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
1628 "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
1630 "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
1640 "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)"
1649 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
1650 "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
1651 "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
1661 "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
1670 "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
1681 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
1690 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
1699 "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
1758 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
1770 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
1785 "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
1840 // NOTE: XORPSrr, XORPDrr are not zero-cycle!
1855 // NOTE: PXORrr,PANDNrr are not zero-cycle!
1869 // PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1883 // GPR Zero-idioms.
1889 // SSE XMM Zero-idioms.
1904 // AVX XMM Zero-idioms.
1919 // AVX YMM Zero-idioms.