//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. Based off AMD Software
// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

def BtVer2Model : SchedMachineModel {
  // All x86 instructions are modeled as a single micro-op, and btver2 can
  // decode 2 instructions per cycle.
  let IssueWidth = 2;
  let MicroOpBufferSize = 64; // Retire Control Unit
  let LoadLatency = 5; // FPU latency (worst case cf Integer 3 cycle latency)
  let HighLatency = 25;
  let MispredictPenalty = 14; // Minimum branch misdirection penalty
  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  // the scheduler to assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The processor always keeps the different parts of an integer register
// together. An instruction that writes to a part of a register will therefore
// have a false dependence on any previous write to the same register or any
// part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
                               0,  // Max moves that can be eliminated per cycle.
                               1>; // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from a MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
                          0,  // Max moves that can be eliminated per cycle.
                          1>; // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  let BufferSize=20;
}

// AGU Pipe Scheduler
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  let BufferSize=12;
}

// Fpu Pipe Scheduler
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  let BufferSize=18;
}

// Functional units
def JDiv    : ProcResource<1>; // integer division
def JMul    : ProcResource<1>; // integer multiplication
def JVALU0  : ProcResource<1>; // vector integer
def JVALU1  : ProcResource<1>; // vector integer
def JVIMUL  : ProcResource<1>; // vector integer multiplication
def JSTC    : ProcResource<1>; // vector store/convert
def JFPM    : ProcResource<1>; // FP multiplication
def JFPA    : ProcResource<1>; // FP addition

// Functional unit groups
def JFPX  : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant is using a single cycle on ExePort.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 3);
    // Keep the list empty when no explicit cycles were given; otherwise
    // prepend the JLAGU cycle so the list lines up with the resource list.
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Same as JWriteResIntPair, but the folded-load variant adds 5 cycles of
// latency instead of 3.
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant is using a single cycle on ExePort.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Pair definition whose defaults are Res = [2] and UOps = 2.
multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register variant uses the cycles given by Res on ExePorts (2 by default).
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data, most RMW
// instructions don't need an extra uop.  ALU RMW operations don't seem to
// benefit from STLF, and their observed latency is 6cy. That is the reason why
// this write adds two extra cycles (instead of just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [2], 2>;

defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteRes<WriteIMulH,          [JALU1], 6, [4], 1>;

defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;

defm : JWriteResIntPair<WriteCMOV,  [JALU01], 1>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
def  : WriteRes<WriteLAHFSAHF, [JALU01]>;

defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF,    [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR,    [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT,  [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT,  [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore,   [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove,    [JALU01]>;
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteFence,  [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }

def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [3,16,16];
  let NumMicroOps = 5;
}

def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 17;
  let ResourceCycles = [3,17,17];
  let NumMicroOps = 6;
}

def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3,1,1];
  let NumMicroOps = 5;
}

def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3,1,1];
  let NumMicroOps = 18;
}

def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 32;
  let ResourceCycles = [6,1,1];
  let NumMicroOps = 28;
}

def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [3,19,19];
  let NumMicroOps = 18;
}

def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 38;
  let ResourceCycles = [6,38,38];
  let NumMicroOps = 28;
}

// Selects one of the CMPXCHG writes above based on the listed predicates;
// the final NoSchedPred entry falls back to WriteCMPXCHG.
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
]>;

// The first five reads are contributed by the memory load operand.
// We ignore those reads and set a read-advance for the other input operands
// including the implicit read of RAX.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
                                                 LCMPXCHG32, LCMPXCHG64,
                                                 CMPXCHG8rm, CMPXCHG16rm,
                                                 CMPXCHG32rm, CMPXCHG64rm)>;

def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
                                             CMPXCHG32rr, CMPXCHG64rr)>;

def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
                                                                           CMPXCHG8B, CMPXCHG16B)>;

def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [1,19,19];
  let NumMicroOps = 1;
}

// Uses JWriteLOCK_ALURMW when CheckLockPrefix holds, WriteALURMW otherwise.
def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred,                       [WriteALURMW]>
]>;
def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
                                                 DEC8m, DEC16m, DEC32m, DEC64m,
                                                 NOT8m, NOT16m, NOT32m, NOT64m,
                                                 NEG8m, NEG16m, NEG32m, NEG64m)>;

def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}
def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
                                             XADD32rr, XADD64rr)>;

// This write defines the latency of the in/out register operand of a non-atomic
// XADDrm. This is the first of a pair of writes that model non-atomic
// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
//
// We need two writes because the instruction latency differs from the output
// register operand latency. In particular, the first write describes the first
// (and only) output register operand of the instruction.  However, the
// instruction latency is set to the MAX of all the write latencies. That's why
// a second write is needed in this case (see example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available in 3 cycles. That is because the value of ECX
// is exchanged with the value read from the stack pointer, and the load-to-use
// latency is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 3;  // load-to-use latency
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XADDrm. This is the first of a sequence of two writes used to model atomic
// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
//
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX becomes available only after 11cy from the start of
// execution. This write is used to specifically set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XCHGrm. This write is the first of a sequence of two writes that describe
// atomic XCHG operations. We need two writes because the instruction latency
// differs from the output register write latency.  We want to make sure that
// the output register operand becomes visible after 11cy. However, we want to
// set the instruction latency to 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}

def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}

def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [16, 16];
  let NumMicroOps = 1;
}

def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
]>;

def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
]>;

def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
                 (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                         LXADD8, LXADD16, LXADD32, LXADD64)>;

def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
                 (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;


////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX,        [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY,        [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;

defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;

defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;

defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;

defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA],  3>;
defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA],  3>;
defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA],  3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA],  3>;
defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA],  3>;
defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA],  3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA],  2>;
defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA],  2>;
defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA],  2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA],  2>;
defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA],  2>;
defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA],  2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom,  [JFPU0, JFPA, JALU0],  3>;
defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0],  3>;
defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM],  2>;
defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM],  2>;
defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM],  2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM],  4, [1,2]>;
defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM],  4, [1,2]>;
defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM],  4, [2,4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : JWriteResFpuPair<WriteDPPD,   [JFPU1, JFPM, JFPA],  9, [1, 3, 3],  3>;
defm : JWriteResFpuPair<WriteDPPS,   [JFPU1, JFPM, JFPA], 11, [1, 3, 3],  5>;
defm : JWriteResYMMPair<WriteDPPSY,  [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM],  2>;
defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM],  2>;
defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM],  2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM],  2>;
defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM],  2>;
defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM],  2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM],  2>;
defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC],  3>;
defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC],  3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic,      [JFPU01, JFPX],  1>;
defm : JWriteResYMMPair<WriteFLogicY,     [JFPU01, JFPX],  1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest,       [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY,      [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle,    [JFPU01, JFPX],  1>;
defm : JWriteResYMMPair<WriteFShuffleY,   [JFPU01, JFPX],  1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX],  3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX],  4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend,      [JFPU01, JFPX],  1>;
defm : JWriteResYMMPair<WriteFBlendY,     [JFPU01, JFPX],  1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend,   [JFPU01, JFPX],  2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY,  [JFPU01, JFPX],  3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX],  1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

defm : JWriteResFpuPair<WriteCvtSS2SD,      [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD,      [JFPU1, JSTC], 2, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY,     [JFPU1, JSTC], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

defm : JWriteResFpuPair<WriteCvtSD2SS,      [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

defm : JWriteResFpuPair<WriteCvtPH2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

defm : X86WriteRes<WriteCvtPS2PH,                 [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY,          [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt,         [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt,  [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;

defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;

defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
// Continuation of the vector integer writes.
// NOTE(review): positional arguments of JWriteResFpuPair / JWriteResYMMPair
// are presumably <write, resources, latency, resource cycles, micro-ops>;
// the multiclasses are defined outside this section -- confirm there.
defm : JWriteResFpuPair<WritePHMINPOS, [JFPU01, JVALU], 2>;

// Shuffles.
defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;

// Blends.
defm : JWriteResFpuPair<WriteBlend,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;

// Vector logic.
defm : JWriteResFpuPair<WriteVecLogic,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;

// Vector test.
defm : JWriteResFpuPair<WriteVecTest,  [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;

// 256-bit shuffle / VPMOV writes are registered as unsupported.
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVPMOV256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): X86WriteRes / JWriteResFpuPair arguments are presumably
// <write, resources, latency, resource cycles, micro-ops>; both multiclasses
// are defined outside this section -- confirm there.
defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU], 1, [1, 1], 2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1, 1, 1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

// Implicit-length forms (PCmpIStr*).
defm : JWriteResFpuPair<WritePCmpIStrI,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        8, [2, 2, 1, 1, 1], 3>;

// Explicit-length forms (PCmpEStr*).
defm : JWriteResFpuPair<WritePCmpEStrI,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

// Each supported MOVMSK write uses JFPU0, JFPA and JALU0 with latency 3;
// WriteVecMOVMSKY is registered as unsupported.
def  : WriteRes<WriteFMOVMSK,   [JFPU0, JFPA, JALU0]> { let Latency = 3; }
def  : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def  : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): JWriteResFpuPair / JWriteResYMMPair arguments are presumably
// <write, resources, latency, resource cycles, micro-ops>; the multiclasses
// are defined outside this section -- confirm there.
defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1, 1, 1, 1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA], 4>;             // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2, 2], 2>;  // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;           // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// Write used for INSERTQ/INSERTQI: latency 2, with resource cycles [1, 4] for
// [JFPU01, JVALU] respectively.
def JWriteINSERTQ : SchedWriteRes<[JFPU01, JVALU]> {
  let Latency        = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// VEXTRACTF128rr: uses [JFPU01, JFPX] and overrides no SchedWriteRes field.
def JWriteVecExtractF128 : SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

// 256-bit broadcast-from-memory forms.
def JWriteVBROADCASTYLd : SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let Latency        = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps    = 2;
}
def : InstRW<[JWriteVBROADCASTYLd],
             (instrs VBROADCASTSDYrm, VBROADCASTSSYrm, VBROADCASTF128)>;

// VZEROALL / VZEROUPPER: modelled with an empty resource list, so only the
// latency and the micro-op count are described.
def JWriteJVZEROALL : SchedWriteRes<[]> {
  let Latency     = 90;
  let NumMicroOps = 73;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

def JWriteJVZEROUPPER : SchedWriteRes<[]> {
  let Latency     = 46;
  let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
///////////////////////////////////////////////////////////////////////////////

// ResourceCycles entries correspond positionally to the resource list.
def JWriteMASKMOVDQU
    : SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
  let Latency        = 34;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
  let NumMicroOps    = 63;
}
def : InstRW<[JWriteMASKMOVDQU],
             (instrs MASKMOVDQU,  MASKMOVDQU64,  MASKMOVDQUX32,
                     VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>;

///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// Write with zero latency and no resources; selected by the variants below
// when the zero-idiom predicate matches.
def JWriteZeroLatency : SchedWriteRes<[]> { let Latency = 0; }

// Write selected for the YMM zero-idiom cases: two micro-ops on
// [JFPU01, JFPX].
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> { let NumMicroOps = 2; }

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].
//
// Every variant below has the same shape: the first SchedVar is chosen when
// its predicate matches, and the NoSchedPred entry names the fallback write.

// GPR SUB/XOR.
def JWriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, XOR32rr, XOR64rr)>;

// XMM floating-point logic (XOR / ANDN).
def JWriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom],
             (instrs XORPSrr,  VXORPSrr,  XORPDrr,  VXORPDrr,
                     ANDNPSrr, VANDNPSrr, ANDNPDrr, VANDNPDrr)>;

// YMM floating-point logic; the zero-idiom case uses JWriteZeroIdiomYmm
// rather than JWriteZeroLatency.
def JWriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr)>;

// MMX integer logic.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;

// XMM integer logic.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX],
             (instrs PXORrr, VPXORrr, PANDNrr, VPANDNrr)>;

// MMX integer ALU (PSUB* / PCMPGT*).
def JWriteVZeroIdiomALU : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU],
             (instrs MMX_PSUBBirr,   MMX_PSUBDirr,
                     MMX_PSUBQirr,   MMX_PSUBWirr,
                     MMX_PSUBSBirr,  MMX_PSUBSWirr,
                     MMX_PSUBUSBirr, MMX_PSUBUSWirr,
                     MMX_PCMPGTBirr, MMX_PCMPGTDirr,
                     MMX_PCMPGTWirr)>;

// XMM integer ALU (PSUB* / PCMPGT*), SSE and AVX encodings.
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX],
             (instrs PSUBBrr,   VPSUBBrr,
                     PSUBDrr,   VPSUBDrr,
                     PSUBQrr,   VPSUBQrr,
                     PSUBWrr,   VPSUBWrr,
                     PSUBSBrr,  VPSUBSBrr,
                     PSUBSWrr,  VPSUBSWrr,
                     PSUBUSBrr, VPSUBUSBrr,
                     PSUBUSWrr, VPSUBUSWrr,
                     PCMPGTBrr, VPCMPGTBrr,
                     PCMPGTDrr, VPCMPGTDrr,
                     PCMPGTQrr, VPCMPGTQrr,
                     PCMPGTWrr, VPCMPGTWrr)>;

// VPERM2F128rr uses its own predicate (ZeroIdiomVPERMPredicate) and falls
// back to WriteFShuffle256.
def JWriteVPERM2F128 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { let Latency = 2; }

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Slow LEAs take JWrite3OpsLEA; every other LEA keeps the default WriteLEA.
def JWriteLEA : SchedWriteVariant<[
  SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
  SchedVar<NoSchedPred,       [WriteLEA]>
]>;

def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// LEA16r gets a dedicated write: latency 3 and 4 cycles on JALU01.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let Latency        = 3;
  let ResourceCycles = [4];
}

def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes treated as zero-idioms, grouped by register class. Each
// DepBreakingClass pairs an opcode list with the predicate that must hold.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr,    MMX_PANDNirr,   MMX_PSUBBirr,
    MMX_PSUBDirr,   MMX_PSUBQirr,   MMX_PSUBWirr,
    MMX_PSUBSBirr,  MMX_PSUBSWirr,  MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,    PANDNrr,
    PSUBBrr,   PSUBWrr,   PSUBDrr,   PSUBQrr,
    PSUBSBrr,  PSUBSWrr,  PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr,    VPANDNrr,
    VPSUBBrr,   VPSUBWrr,   VPSUBDrr,   VPSUBQrr,
    VPSUBSBrr,  VPSUBSWrr,  VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  // VPERM2F128rr is checked with its own predicate.
  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;

// Opcodes registered as dependency-breaking through IsDepBreakingFunction
// (kept separate from the zero-idiom list above).
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>
]>;

// Register-to-register moves registered as optimizable. TruePred places no
// extra condition on the listed opcodes.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr,
    MOVAPDrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr,
    VMOVAPDrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;

} // SchedModel