//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. Based off AMD Software
// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

def BtVer2Model : SchedMachineModel {
  // Every x86 instruction is modeled as a single micro-op, and btver2 can
  // decode 2 instructions per cycle.
  let IssueWidth = 2;
  let MicroOpBufferSize = 64; // Retire Control Unit
  let LoadLatency = 5; // FPU latency (worst case; cf. integer 3 cycle latency)
  let HighLatency = 25;
  let MispredictPenalty = 14; // Minimum branch misdirection penalty
  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. The model is flagged as incomplete so
  // that the scheduler can assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The processor always keeps the different parts of an integer register
// together. An instruction that writes to a part of a register will therefore
// have a false dependence on any previous write to the same register or any
// part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF
    : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
                   0,  // Max moves that can be eliminated per cycle.
                   1>; // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from a MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF
    : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
                   0,  // Max moves that can be eliminated per cycle.
                   1>; // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  let BufferSize = 20;
}

// AGU Pipe Scheduler
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  let BufferSize = 12;
}

// Fpu Pipe Scheduler
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  let BufferSize = 18;
}

// Functional units
def JDiv : ProcResource<1>; // integer division
def JMul : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
def JVALU1 : ProcResource<1>; // vector integer
def JVIMUL : ProcResource<1>; // vector integer multiplication
def JSTC : ProcResource<1>; // vector store/convert
def JFPM : ProcResource<1>; // FP multiplication
def JFPA : ProcResource<1>; // FP addition

// Functional unit groups
def JFPX : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// Integer loads take 3 cycles, so ReadAfterLd registers needn't be available
// until 3 cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads take 5 cycles, so ReadAfterVec*Ld registers needn't be available
// until 5 cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
//
// Template arguments (shared by the three pair multiclasses below):
//   SchedRW  - the foldable write; SchedRW.Folded is its folded-load variant.
//   ExePorts - resources used by the register variant.
//   Lat      - latency of the register variant.
//   Res      - resource cycles, one entry per ExePorts element.
//   UOps     - micro-op count of the register variant.
//   LoadUOps - extra micro-ops added to the folded-load variant.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: runs on ExePorts with the values passed in.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant: JLAGU is prepended to the resource list (with one cycle
  // when Res is non-empty) and the latency grows by 3 cycles.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 3);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: runs on ExePorts with the values passed in.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant: JLAGU is prepended to the resource list (with one cycle
  // when Res is non-empty) and the latency grows by 5 cycles.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register variant: runs on ExePorts with the values passed in (Res and
  // UOps both default to 2 here).
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant: JLAGU is prepended with 2 cycles and the latency grows by
  // 5 cycles.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data, most RMW
// instructions don't need an extra uop. ALU RMW operations don't seem to
// benefit from STLF, and their observed latency is 6cy. That is the reason why
// this write adds two extra cycles (instead of just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;

defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteResUnsupported<WriteIMulH>;
defm : X86WriteResUnsupported<WriteIMulHLd>;
defm : X86WriteResPairUnsupported<WriteMULX32>;
defm : X86WriteResPairUnsupported<WriteMULX64>;

// Integer division: every width uses JALU1 for one cycle and occupies JDiv
// for as many cycles as the listed latency.
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1, 1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01, JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;

defm : X86WriteRes<WriteBitTest, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd, [JALU01, JLAGU], 4, [1, 1], 1>;
defm : X86WriteRes<WriteBitTestRegLd, [JALU01, JLAGU], 4, [1, 1], 5>;
defm : X86WriteRes<WriteBitTestSet, [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01, JLAGU], 4, [1, 1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01, JLAGU], 4, [1, 1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS, [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

// No processor resources are listed for this write.
def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteFence, [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }

// Write definitions for the individual CMPXCHG flavours. They are selected by
// JWriteCMPXCHGVariant below.
def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [3, 16, 16];
  let NumMicroOps = 5;
}

def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 17;
  let ResourceCycles = [3, 17, 17];
  let NumMicroOps = 6;
}

def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3, 1, 1];
  let NumMicroOps = 5;
}

def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3, 1, 1];
  let NumMicroOps = 18;
}

def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 32;
  let ResourceCycles = [6, 1, 1];
  let NumMicroOps = 28;
}

def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [3, 19, 19];
  let NumMicroOps = 18;
}

def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 38;
  let ResourceCycles = [6, 38, 38];
  let NumMicroOps = 28;
}

// Picks the write for a CMPXCHG instruction. The final NoSchedPred entry is
// the fallback (WriteCMPXCHG).
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
  SchedVar<NoSchedPred, [WriteCMPXCHG]>
]>;

// The first five reads are contributed by the memory load operand.
// We ignore those reads and set a read-advance for the other input operands
// including the implicit read of RAX.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8, LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
                     CMPXCHG8rm, CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

def : InstRW<[JWriteCMPXCHGVariant],
             (instrs CMPXCHG8rr, CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;

def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8B, LCMPXCHG16B, CMPXCHG8B, CMPXCHG16B)>;

// Write used for the listed RMW instructions when CheckLockPrefix matches.
def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [1, 19, 19];
  let NumMicroOps = 1;
}

def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred, [WriteALURMW]>
]>;
def : InstRW<[JWriteLOCK_ALURMWVariant],
             (instrs INC8m, INC16m, INC32m, INC64m,
                     DEC8m, DEC16m, DEC32m, DEC64m,
                     NOT8m, NOT16m, NOT32m, NOT64m,
                     NEG8m, NEG16m, NEG32m, NEG64m)>;

def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}
def : InstRW<[JWriteXCHG8rr_XADDrr],
             (instrs XCHG8rr, XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;

// This write defines the latency of the in/out register operand of a non-atomic
// XADDrm. This is the first of a pair of writes that model non-atomic
// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
//
// We need two writes because the instruction latency differs from the output
// register operand latency. In particular, the first write describes the first
// (and only) output register operand of the instruction. However, the
// instruction latency is set to the MAX of all the write latencies. That's why
// a second write is needed in this case (see example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available in 3 cycles. That is because the value of ECX
// is exchanged with the value loaded from memory at (%rsp), and the load-to-use
// latency is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 3; // load-to-use latency
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XADDrm. This is the first of a sequence of two writes used to model atomic
// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
//
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX becomes available only after 11cy from the start of
// execution. This write is used to specifically set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XCHGrm. This write is the first of a sequence of two writes that describe
// atomic XCHG operations. We need two writes because the instruction latency
// differs from the output register write latency. We want to make sure that
// the output register operand becomes visible after 11cy. However, we want to
// set the instruction latency to 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}

// Load/store halves paired with the *_XCHG_Part writes above.
def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}

def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [16, 16];
  let NumMicroOps = 1;
}

// First write of an XADDrm: register operand latency, chosen by lock prefix.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
]>;

// Second write of an XADDrm: load/store part, chosen by lock prefix.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
]>;

def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
             (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                     LXADD8, LXADD16, LXADD32, LXADD64)>;

def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
             (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;


////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

// FLD0/FLD1/FLDC writes and FP loads.
defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

// FP stores.
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;

// FP masked stores.
defm : X86WriteRes<WriteFMaskedStore32,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 5, 5, 4, 4, 4], 19>;
defm : X86WriteRes<WriteFMaskedStore64,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   13, [1, 1, 2, 2, 2, 2, 2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   22, [1, 1, 10, 10, 8, 8, 8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 4, 4, 4, 4, 4], 18>;

// FP moves.
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResUnsupported<WriteFMoveZ>;

defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;

// WriteFAdd* family.
defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
// WriteFCmp* / WriteFCom* family.
defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0], 3>;
// WriteFMul* family.
defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
// WriteFMA* family (all marked unsupported).
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
// WriteDPP* family.
defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
// WriteFRcp* / WriteFRsqrt* families.
defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
// WriteFDiv* family.
defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
// WriteFSqrt* family.
defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;
// WriteFSign / WriteFRnd* / WriteFLogic* / WriteFTest* families.
defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
// WriteFShuffle* / WriteFVarShuffle* families.
defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY, [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
// WriteFBlend* / WriteFVarBlend* families.
defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

// FP -> integer.
defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

// Integer -> FP.
defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

// Single -> double precision.
defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> single precision.
defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

// WriteCvtPH2PS* family.
defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

// WriteCvtPS2PH* family, register and store forms.
defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2, 2, 2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2, 2, 2, 1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

// Vector loads.
defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

// Vector stores.
defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;

// Vector moves.
defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;

// WriteVecALU* family.
defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
// WriteVecShift* / WriteVarVecShift* families.
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// WriteVecIMul* / WritePMULLD* / WriteMPSAD* families.
defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
// WritePSADBW* family.
defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : JWriteResFpuPair<WritePHMINPOS, [JFPU01, JVALU], 2>;

// Shuffle writes.
defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;

// Blend writes.
defm : JWriteResFpuPair<WriteBlend,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;

// Vector logic writes.
defm : JWriteResFpuPair<WriteVecLogic,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;

// Vector test writes; both forms list the integer pipe JALU0 as a resource.
defm : JWriteResFpuPair<WriteVecTest,  [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY,
                        [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;

// 256-bit shuffle / VPMOV writes are marked unsupported in this model.
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

// The load (Ld) form lists JLAGU and the store (St) form lists JSAGU in
// addition to the FPU resources.
defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU],        1, [1, 1],    2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0],   3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU],   3, [1, 1, 1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

// Implicit-length forms (PCmpIStr*).
defm : JWriteResFpuPair<WritePCmpIStrI,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        8, [2, 2, 1, 1, 1], 3>;

// Explicit-length forms (PCmpEStr*).
defm : JWriteResFpuPair<WritePCmpEStrI,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

// Every supported MOVMSK write uses the same three resources with Latency = 3;
// only Latency is overridden, all other WriteRes fields keep their defaults.
let Latency = 3 in {
  def : WriteRes<WriteFMOVMSK,   [JFPU0, JFPA, JALU0]>;
  def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]>;
}
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
let Latency = 3 in
def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]>;

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc,
                        [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1, 1, 1, 1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA],   4>;            // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA],   4, [2, 2], 2>; // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;            // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// Write used for INSERTQ / INSERTQI: latency 2, with JVALU held for 4 cycles
// and JFPU01 for 1.
def JWriteINSERTQ : SchedWriteRes<[JFPU01, JVALU]> {
  let ResourceCycles = [1, 4];
  let Latency = 2;
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// VEXTRACTF128rr: JFPU01 + JFPX, every other SchedWriteRes field left at its
// default.
def JWriteVecExtractF128 : SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

// YMM broadcasts from memory: 2 micro-ops, latency 6.
def JWriteVBROADCASTYLd : SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let NumMicroOps = 2;
  let Latency = 6;
  let ResourceCycles = [1, 2, 4];
}
def : InstRW<[JWriteVBROADCASTYLd],
             (instrs VBROADCASTSDYrm, VBROADCASTSSYrm, VBROADCASTF128)>;

// VZEROALL: no processor resources are listed; only latency and micro-op count
// are modeled.
def JWriteJVZEROALL : SchedWriteRes<[]> {
  let NumMicroOps = 73;
  let Latency = 90;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

// VZEROUPPER: likewise modeled with latency and micro-op count only.
def JWriteJVZEROUPPER : SchedWriteRes<[]> {
  let NumMicroOps = 37;
  let Latency = 46;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
///////////////////////////////////////////////////////////////////////////////

// ResourceCycles entries correspond positionally to the seven resources in the
// SchedWriteRes list.
def JWriteMASKMOVDQU : SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC,
                                      JLAGU, JSAGU, JALU01]> {
  let NumMicroOps = 63;
  let Latency = 34;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
}
def : InstRW<[JWriteMASKMOVDQU],
             (instrs MASKMOVDQU, MASKMOVDQU64, VMASKMOVDQU, VMASKMOVDQU64)>;

///////////////////////////////////////////////////////////////////////////////
//  SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// A write that consumes no resources and has zero latency.
def JWriteZeroLatency : SchedWriteRes<[]> { let Latency = 0; }

// A two-micro-op write on JFPU01 + JFPX.
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> { let NumMicroOps = 2; }

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].

// Each variant below resolves to the first SchedVar whose predicate holds and
// otherwise falls through to the NoSchedPred entry, i.e. the regular write.

// GPR SUB/XOR: zero latency when ZeroIdiomPredicate holds, else WriteALU.
def JWriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, XOR32rr, XOR64rr)>;

// XMM FP XOR/ANDN: zero latency when ZeroIdiomPredicate holds, else
// WriteFLogic.
def JWriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom],
             (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
                     ANDNPSrr, VANDNPSrr, ANDNPDrr, VANDNPDrr)>;

// YMM FP XOR/ANDN: the idiom case uses JWriteZeroIdiomYmm rather than
// JWriteZeroLatency; otherwise WriteFLogicY.
def JWriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr)>;

// MMX PXOR/PANDN: zero latency when ZeroIdiomPredicate holds, else
// WriteVecLogic.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;

// XMM PXOR/PANDN: zero latency when ZeroIdiomPredicate holds, else
// WriteVecLogicX.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX],
             (instrs PXORrr, VPXORrr, PANDNrr, VPANDNrr)>;

// MMX PSUB*/PCMPGT*: zero latency when ZeroIdiomPredicate holds, else
// WriteVecALU.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU],
             (instrs MMX_PSUBBrr, MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
                     MMX_PSUBSBrr, MMX_PSUBSWrr,
                     MMX_PSUBUSBrr, MMX_PSUBUSWrr,
                     MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr)>;

// XMM PSUB*/PCMPGT* (SSE and AVX encodings): zero latency when
// ZeroIdiomPredicate holds, else WriteVecALUX.
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX],
             (instrs PSUBBrr,   VPSUBBrr,   PSUBDrr,   VPSUBDrr,
                     PSUBQrr,   VPSUBQrr,   PSUBWrr,   VPSUBWrr,
                     PSUBSBrr,  VPSUBSBrr,  PSUBSWrr,  VPSUBSWrr,
                     PSUBUSBrr, VPSUBUSBrr, PSUBUSWrr, VPSUBUSWrr,
                     PCMPGTBrr, VPCMPGTBrr, PCMPGTDrr, VPCMPGTDrr,
                     PCMPGTQrr, VPCMPGTQrr, PCMPGTWrr, VPCMPGTWrr)>;

// VPERM2F128rr: JWriteZeroIdiomYmm when ZeroIdiomVPERMPredicate holds, else
// WriteFShuffle256.
def JWriteVPERM2F128 : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { let Latency = 2; }

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1: operand 2 is an immediate and
    // that immediate is not 1.
    CheckAll<[CheckIsImmOperand<2>, CheckNot<CheckImmOperand<2, 1>>]>
  ]>
>;

// 32/64-bit LEA: JWrite3OpsLEA when JSlowLEAPredicate holds, else WriteLEA.
def JWriteLEA : SchedWriteVariant<[
    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
    SchedVar<NoSchedPred,       [WriteLEA]>
]>;

def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// LEA16r always uses this write: latency 3, JALU01 held for 4 cycles.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let ResourceCycles = [4];
  let Latency = 3;
}

def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes reported as zero-idioms, grouped by the predicate that must hold.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORrr, MMX_PANDNrr,
    MMX_PSUBBrr, MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
    MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
    MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  // VPERM2F128 has its own predicate.
  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;

// Opcodes reported as dependency-breaking (listed separately from the
// zero-idioms above), again grouped by predicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>
]>;

// Register-to-register moves treated as optimizable; the class predicate is
// TruePred, so no further operand check is applied here.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr,
    MOVAPDrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr,
    VMOVAPDrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;

} // SchedModel