//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. Based off AMD Software
// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

def BtVer2Model : SchedMachineModel {
  // Every x86 instruction is modeled as a single micro-op, and btver2 is able
  // to decode 2 instructions per cycle.
  let IssueWidth = 2;

  // Retire Control Unit.
  let MicroOpBufferSize = 64;

  // FPU load latency (the worst case; compare the 3 cycle integer latency).
  let LoadLatency = 5;
  let HighLatency = 25;

  // Minimum branch misdirection penalty.
  let MispredictPenalty = 14;
  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. Clearing this flag allows the scheduler
  // to assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle.
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The different parts of an integer register are always kept together by the
// processor. As a consequence, an instruction that writes only part of a
// register has a false dependence on any earlier write to that register or to
// any part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<
    64,
    [GR64, CCR],
    [1, 1],
    [1, 0],
    0,   // Max moves that can be eliminated per cycle.
    1>;  // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from an MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF : RegisterFile<
    72,
    [VR64, VR128, VR256],
    [1, 1, 2],
    [1, 1, 0],
    0,   // Max moves that can be eliminated per cycle.
    1>;  // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer pipe scheduler.
let BufferSize = 20 in
def JALU01 : ProcResGroup<[JALU0, JALU1]>;

// AGU pipe scheduler.
let BufferSize = 12 in
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]>;

// FPU pipe scheduler.
let BufferSize = 18 in
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]>;

// Functional units.
def JDiv   : ProcResource<1>; // integer division
def JMul   : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
def JVALU1 : ProcResource<1>; // vector integer
def JVIMUL : ProcResource<1>; // vector integer multiplication
def JSTC   : ProcResource<1>; // vector store/convert
def JFPM   : ProcResource<1>; // FP multiplication
def JFPA   : ProcResource<1>; // FP addition

// Functional unit groups.
def JFPX  : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// An integer load takes 3 cycles, which means ReadAfterLd registers do not have
// to be available until 3 cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// A vector load takes 5 cycles, which means ReadAfterVec*Ld registers do not
// have to be available until 5 cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd,  5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register form: runs on ExePorts with the latency, resource cycles and
  // micro-op count passed in by the caller.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let ResourceCycles = Res;
    let Latency = Lat;
  }

  // Folded-load form: JLAGU is put in front of ExePorts (with one cycle when
  // Res is given explicitly), the latency grows by 3 cycles and LoadUOps is
  // added to the micro-op count.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let Latency = !add(Lat, 3);
  }
}

multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register form: runs on ExePorts with the latency, resource cycles and
  // micro-op count passed in by the caller.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let ResourceCycles = Res;
    let Latency = Lat;
  }

  // Folded-load form: JLAGU is put in front of ExePorts (with one cycle when
  // Res is given explicitly), the latency grows by 5 cycles and LoadUOps is
  // added to the micro-op count.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let Latency = !add(Lat, 5);
  }
}

multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register form: runs on ExePorts with the latency, resource cycles and
  // micro-op count passed in by the caller (Res defaults to [2], UOps to 2).
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let ResourceCycles = Res;
    let Latency = Lat;
  }

  // Folded-load form: JLAGU is put in front of ExePorts with 2 cycles, the
  // latency grows by 5 cycles and LoadUOps is added to the micro-op count.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let ResourceCycles = !listconcat([2], Res);
    let Latency = !add(Lat, 5);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data, and most RMW
// instructions don't need an extra uop. ALU RMW operations don't seem to
// benefit from STLF, and their observed latency is 6cy. That is the reason why
// this write adds two extra cycles (instead of just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32,    [JALU01],               1,  [1],       1>;
defm : X86WriteRes<WriteBSWAP64,    [JALU01],               1,  [1],       1>;
defm : X86WriteRes<WriteCMPXCHG,    [JALU01],               3,  [3],       5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG,       [JALU01],               1,  [2],       2>;

defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteResUnsupported<WriteIMulH>;
defm : X86WriteResUnsupported<WriteIMulHLd>;
defm : X86WriteResPairUnsupported<WriteMULX32>;
defm : X86WriteResPairUnsupported<WriteMULX64>;

defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

// Conditional move.
defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>;
// x87 conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1, 1], 1>;
// Setcc.
def : WriteRes<WriteSETCC,      [JALU01]>;
def : WriteRes<WriteSETCCStore, [JALU01, JSAGU]>;
def : WriteRes<WriteLAHFSAHF,   [JALU01]>;

defm : X86WriteRes<WriteBitTest,         [JALU01],        1, [1],    1>;
defm : X86WriteRes<WriteBitTestImmLd,    [JALU01, JLAGU], 4, [1, 1], 1>;
defm : X86WriteRes<WriteBitTestRegLd,    [JALU01, JLAGU], 4, [1, 1], 5>;
defm : X86WriteRes<WriteBitTestSet,      [JALU01],        1, [1],    2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01, JLAGU], 4, [1, 1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01, JLAGU], 4, [1, 1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF,    [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR,    [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT,  [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT,  [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri,  [JALU01],        3, [6],     6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01],        4, [8],     7>;
defm : X86WriteRes<WriteSHDmri,  [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

let Latency = 3 in
def : WriteRes<WriteLoad, [JLAGU]>;
def : WriteRes<WriteStore,   [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove,    [JALU01]>;
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;

// Load/store MXCSR.
let Latency = 3 in
def : WriteRes<WriteLDMXCSR, [JLAGU]>;
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

let Latency = 100 in {
  def : WriteRes<WriteSystem,     [JALU01]>;
  def : WriteRes<WriteMicrocoded, [JALU01]>;
}
def : WriteRes<WriteFence, [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
let Latency = 1 in
def : WriteRes<WriteNop, [JALU01]>;

def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 3;
}

def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 5;
  let ResourceCycles = [3, 16, 16];
  let Latency = 16;
}

def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 6;
  let ResourceCycles = [3, 17, 17];
  let Latency = 17;
}

def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 5;
  let ResourceCycles = [3, 1, 1];
  let Latency = 11;
}

def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 18;
  let ResourceCycles = [3, 1, 1];
  let Latency = 11;
}

def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 28;
  let ResourceCycles = [6, 1, 1];
  let Latency = 32;
}

def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 18;
  let ResourceCycles = [3, 19, 19];
  let Latency = 19;
}

def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 28;
  let ResourceCycles = [6, 38, 38];
  let Latency = 38;
}

// Selects the write used for a compare-and-swap instruction. The entries are
// listed in a fixed order, with the NoSchedPred fallback last; do not reorder.
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
]>;

// The first five reads are contributed by the memory load operand.
// Those reads are ignored; a read-advance is set for the remaining input
// operands, including the implicit read of RAX.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8, LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
                     CMPXCHG8rm, CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

def : InstRW<[JWriteCMPXCHGVariant],
             (instrs CMPXCHG8rr, CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;

def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8B, LCMPXCHG16B, CMPXCHG8B, CMPXCHG16B)>;

def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1, 19, 19];
  let Latency = 19;
}

// Uses JWriteLOCK_ALURMW when CheckLockPrefix holds, WriteALURMW otherwise.
def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred,                       [WriteALURMW]>
]>;
def : InstRW<[JWriteLOCK_ALURMWVariant],
             (instrs INC8m, INC16m, INC32m, INC64m,
                     DEC8m, DEC16m, DEC32m, DEC64m,
                     NOT8m, NOT16m, NOT32m, NOT64m,
                     NEG8m, NEG16m, NEG32m, NEG64m)>;

def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 2;
}
def : InstRW<[JWriteXCHG8rr_XADDrr],
             (instrs XCHG8rr, XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;

// This write defines the latency of the in/out register operand of a non-atomic
// XADDrm.
// This is the first of a pair of writes that model non-atomic
// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
//
// Two writes are required because the instruction latency is not the same as
// the latency of the output register operand. The first write describes the
// first (and only) output register operand of the instruction, whereas the
// instruction latency is set to the MAX of all the write latencies. That is
// why a second write is needed in this case (see example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available in 3 cycles. That is because the value of ECX
// is exchanged with the value read from the stack pointer, and the load-to-use
// latency is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 3; // load-to-use latency
}

// This write defines the latency of the in/out register operand of an atomic
// XADDrm. It is the first of a sequence of two writes used to model atomic
// XADD instructions; the second write of the sequence is
// JWriteXCHGrm_LdSt_Part.
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX becomes available only after 11cy from the start of
// execution. This write is used to specifically set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 11;
}

// This write defines the latency of the in/out register operand of an atomic
// XCHGrm. It is the first of a sequence of two writes that describe atomic
// XCHG operations. Two writes are required because the instruction latency
// differs from the output register write latency: the output register operand
// must become visible after 11cy, while the instruction latency must be set
// to 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 2;
  let ResourceCycles = [2];
  let Latency = 11;
}

def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1, 1];
  let Latency = 11;
}

def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [16, 16];
  let Latency = 16;
}

// First write of an XADDrm: the LOCK flavour when CheckLockPrefix holds.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
]>;

// Second write of an XADDrm: the LOCK flavour when CheckLockPrefix holds.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
]>;

def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
             (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                     LXADD8, LXADD16, LXADD32, LXADD64)>;

def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
             (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;


////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteFLD0,         [JFPU1, JSTC],         3, [1, 1],    1>;
defm : X86WriteRes<WriteFLD1,         [JFPU1, JSTC],         3, [1, 1],    1>;
defm : X86WriteRes<WriteFLDC,         [JFPU1, JSTC],         3, [1, 1],    1>;
defm : X86WriteRes<WriteFLoad,        [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX,       [JLAGU],               5, [1],       1>;
defm : X86WriteRes<WriteFLoadY,       [JLAGU],               5, [2],       2>;
defm : X86WriteRes<WriteFMaskedLoad,  [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

defm : X86WriteRes<WriteFStore,    [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX,   [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY,   [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT,  [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;

defm : X86WriteRes<WriteFMaskedStore32,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 5, 5, 4, 4, 4], 19>;
defm : X86WriteRes<WriteFMaskedStore64,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   13, [1, 1, 2, 2, 2, 2, 2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   22, [1, 1, 10, 10, 8, 8, 8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 4, 4, 4, 4, 4], 18>;

defm : X86WriteRes<WriteFMove,  [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResUnsupported<WriteFMoveZ>;

defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;

defm : JWriteResFpuPair<WriteFAdd,    [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX,   [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY,   [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64,  [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : JWriteResFpuPair<WriteFCmp,    [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX,   [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY,   [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64,  [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom,    [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFComX,   [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFMul,    [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX,   [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY,   [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64,  [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : JWriteResFpuPair<WriteDPPD,  [JFPU1, JFPM, JFPA], 9,  [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS,  [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
defm : JWriteResFpuPair<WriteFRcp,     [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX,    [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY,    [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt,   [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX,  [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY,  [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
defm : JWriteResFpuPair<WriteFDiv,     [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX,    [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY,    [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64,   [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X,  [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y,  [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : JWriteResFpuPair<WriteFSqrt,    [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX,   [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY,   [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64,  [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80,  [JFPU1, JFPM], 35, [1, 35]>;
defm : JWriteResFpuPair<WriteFSign,    [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd,     [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY,    [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic,   [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY,  [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest,    [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY,   [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle,     [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY,    [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle,  [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY, [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend,       [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY,      [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend,    [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY,   [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256,  [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCvtSS2I,  [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I,  [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

defm : X86WriteRes<WriteCvtI2SS,       [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd,     [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD,       [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd,     [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

defm : JWriteResFpuPair<WriteCvtSS2SD,  [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD,  [JFPU1, JSTC], 2, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

defm : JWriteResFpuPair<WriteCvtSD2SS,  [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

defm : JWriteResFpuPair<WriteCvtPH2PS,  [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

defm : X86WriteRes<WriteCvtPS2PH,    [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY,   [JFPU1, JSTC, JFPX], 6, [2, 2, 2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt,  [JFPU1, JSTC, JSAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2, 2, 2, 1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteVecLoad,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX,       [JLAGU],                5, [1],       1>;
defm : X86WriteRes<WriteVecLoadY,       [JLAGU],                5, [2],       2>;
defm : X86WriteRes<WriteVecLoadNT,      [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY,     [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad,  [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

defm : X86WriteRes<WriteVecStore,    [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX,   [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY,   [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT,  [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;

defm : X86WriteRes<WriteVecMove,        [JFPU01, JVALU],      1, [1, 1],    1>;
defm : X86WriteRes<WriteVecMoveX,       [JFPU01, JVALU],      1, [1, 1],    1>;
defm : X86WriteRes<WriteVecMoveY,       [JFPU01, JVALU],      1, [2, 2],    2>;
defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr,   [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX],       8, [1, 1],    2>;

defm : JWriteResFpuPair<WriteVecALU,       [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX,      [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX,    [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : JWriteResFpuPair<WriteVecIMul,      [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX,     [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD,       [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD,        [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : JWriteResFpuPair<WritePSADBW,       [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX,      [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;

// Shuffle writes (fixed and variable).
defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;

// Blend writes (fixed and variable).
defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;

// Vector logic writes.
defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;

// Vector test writes; both modeled forms list the integer pipe JALU0 in
// addition to FPU-side resources.
defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;

// 256-bit shuffle / VPMOV writes are declared unsupported.
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

// The load form (InsertLd) additionally lists JLAGU; the store form
// (ExtractSt) additionally lists JSAGU.
defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

// The implicit-length (IStr*) writes share one resource list, and the
// explicit-length (EStr*) writes share another, longer one.
defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

// All three modeled MOVMSK writes use the same resources and a 3-cycle
// latency; the Y form is declared unsupported.
def  : WriteRes<WriteFMOVMSK,   [JFPU0, JFPA, JALU0]> { let Latency = 3; }
def  : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def  : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA], 4>;            // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;          // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// INSERTQ/INSERTQI: 2-cycle latency; occupies JFPU01 for 1 cycle and JVALU
// for 4 cycles.
def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
  let Latency = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// VEXTRACTF128rr: no field overrides, so the SchedWriteRes defaults apply.
def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

// 256-bit broadcast-from-memory forms: 6-cycle latency, 2 micro-ops;
// resource cycles are 1 on JLAGU, 2 on JFPU01 and 4 on JFPX.
def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let Latency = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps = 2;
}
def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
                                            VBROADCASTSSYrm,
                                            VBROADCASTF128)>;

// VZEROALL: modeled with an empty resource list, a 90-cycle latency and
// 73 micro-ops.
def JWriteJVZEROALL: SchedWriteRes<[]> {
  let Latency = 90;
  let NumMicroOps = 73;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

// VZEROUPPER: modeled with an empty resource list, a 46-cycle latency and
// 37 micro-ops.
def JWriteJVZEROUPPER: SchedWriteRes<[]> {
  let Latency = 46;
  let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
///////////////////////////////////////////////////////////////////////////////

// 34-cycle latency, 63 micro-ops. The ResourceCycles entries pair up
// positionally with the resource list (e.g. 16 cycles on JSAGU and 42 on
// JALU01).
def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
  let Latency = 34;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
  let NumMicroOps = 63;
}
def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
                                         VMASKMOVDQU, VMASKMOVDQU64)>;

///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// A write with no resources and zero latency.
def JWriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
}

// A two micro-op write on JFPU01/JFPX.
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
  let NumMicroOps = 2;
}

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].

// Each variant below has the same shape: when ZeroIdiomPredicate holds the
// instruction is given the write named in the first SchedVar; otherwise
// (NoSchedPred) it keeps its ordinary write class.

// GPR SUB/XOR.
def JWriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
                                        XOR32rr, XOR64rr)>;

// XORPS/XORPD/ANDNPS/ANDNPD and their V-prefixed rr forms.
def JWriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
                                         ANDNPSrr, VANDNPSrr,
                                         ANDNPDrr, VANDNPDrr)>;

// Yrr forms: the zero-idiom case maps to JWriteZeroIdiomYmm rather than
// JWriteZeroLatency.
def JWriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
                                          VANDNPSYrr, VANDNPDYrr)>;

// MMX PXOR/PANDN.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;

// PXOR/PANDN and their V-prefixed rr forms.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
                                               PANDNrr, VPANDNrr)>;

// MMX PSUB*/PCMPGT*.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr,
                                            MMX_PSUBQrr, MMX_PSUBWrr,
                                            MMX_PSUBSBrr, MMX_PSUBSWrr,
                                            MMX_PSUBUSBrr, MMX_PSUBUSWrr,
                                            MMX_PCMPGTBrr, MMX_PCMPGTDrr,
                                            MMX_PCMPGTWrr)>;

// PSUB*/PCMPGT* and their V-prefixed rr forms: JWriteZeroLatency when
// ZeroIdiomPredicate holds, WriteVecALUX otherwise.
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
                                             PSUBDrr, VPSUBDrr,
                                             PSUBQrr, VPSUBQrr,
                                             PSUBWrr, VPSUBWrr,
                                             PSUBSBrr, VPSUBSBrr,
                                             PSUBSWrr, VPSUBSWrr,
                                             PSUBUSBrr, VPSUBUSBrr,
                                             PSUBUSWrr, VPSUBUSWrr,
                                             PCMPGTBrr, VPCMPGTBrr,
                                             PCMPGTDrr, VPCMPGTDrr,
                                             PCMPGTQrr, VPCMPGTQrr,
                                             PCMPGTWrr, VPCMPGTWrr)>;

// VPERM2F128rr: JWriteZeroIdiomYmm when ZeroIdiomVPERMPredicate holds,
// WriteFShuffle256 otherwise.
def JWriteVPERM2F128 : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
  let Latency = 2;
}

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1: operand 2 must be an immediate
    // and that immediate must not be 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Slow LEAs get JWrite3OpsLEA; all others keep the generic WriteLEA.
def JWriteLEA : SchedWriteVariant<[
    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
    SchedVar<NoSchedPred,       [WriteLEA]>
]>;

def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// LEA16r is mapped unconditionally to this write: 3-cycle latency and
// 4 resource cycles on JALU01.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [4];
}

def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes treated as zero-idioms, grouped by the predicate that must hold.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
    MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
    MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
    MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;

// Opcodes listed as dependency-breaking without being in the zero-idiom list
// above. CMP32rr/CMP64rr use an explicit same-register check on operands 0
// and 1; every other class uses ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>
]>;

// Register-to-register moves declared optimizable; the class is guarded by
// TruePred, i.e. no additional condition is attached here.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr,
    MOVAPDrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr,
    VMOVAPDrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;

} // SchedModel