//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. Based off AMD Software
// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record for btver2 (Jaguar).
def BtVer2Model : SchedMachineModel {
  // All x86 instructions are modeled as a single micro-op, and btver2 can
  // decode 2 instructions per cycle.
  let IssueWidth        = 2;
  let MicroOpBufferSize = 64; // Retire Control Unit
  let LoadLatency       = 5;  // FPU latency (worst case cf Integer 3 cycle latency)
  let HighLatency       = 25;
  let MispredictPenalty = 14; // Minimum branch misdirection penalty
  let PostRAScheduler   = 1;

  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  // the scheduler to assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The processor always keeps the different parts of an integer register
// together. An instruction that writes to a part of a register will therefore
// have a false dependence on any previous write to the same register or any
// part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
                               0,  // Max moves that can be eliminated per cycle.
                               1>; // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from a MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
                           0,  // Max moves that can be eliminated per cycle.
                           1>; // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  let BufferSize = 20;
}

// AGU Pipe Scheduler
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  let BufferSize = 12;
}

// Fpu Pipe Scheduler
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  let BufferSize = 18;
}

// Functional units
def JDiv   : ProcResource<1>; // integer division
def JMul   : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
def JVALU1 : ProcResource<1>; // vector integer
def JVIMUL : ProcResource<1>; // vector integer multiplication
def JSTC   : ProcResource<1>; // vector store/convert
def JFPM   : ProcResource<1>; // FP multiplication
def JFPA   : ProcResource<1>; // FP addition

// Functional unit groups
def JFPX  : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd,  5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: latency Lat, Res resource cycles on ExePorts, UOps
  // micro-ops.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency        = Lat;
    let ResourceCycles = Res;
    let NumMicroOps    = UOps;
  }

  // Memory variant: JLAGU is prepended to the port list (one cycle when Res
  // is given explicitly), the latency grows by 3 cycles, and LoadUOps extra
  // micro-ops are added.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency        = !add(Lat, 3);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps    = !add(UOps, LoadUOps);
  }
}

// Same shape as JWriteResIntPair, but the folded-load form adds 5 cycles of
// latency instead of 3.
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: latency Lat, Res resource cycles on ExePorts, UOps
  // micro-ops.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency        = Lat;
    let ResourceCycles = Res;
    let NumMicroOps    = UOps;
  }

  // Memory variant: JLAGU is prepended to the port list (one cycle when Res
  // is given explicitly), the latency grows by 5 cycles, and LoadUOps extra
  // micro-ops are added.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency        = !add(Lat, 5);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps    = !add(UOps, LoadUOps);
  }
}

// YMM flavour of the pair: Res defaults to [2] and UOps to 2.
multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register variant: latency Lat, Res resource cycles on ExePorts (two by
  // default), UOps micro-ops.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency        = Lat;
    let ResourceCycles = Res;
    let NumMicroOps    = UOps;
  }

  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency        = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
    let NumMicroOps    = !add(UOps, LoadUOps);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data, most RMW
// instructions don't need an extra uop. ALU RMW operations don't seem to
// benefit from STLF, and their observed latency is 6cy. That is the reason why
// this write adds two extra cycles (instead of just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32,    [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64,    [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG,    [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG,       [JALU01], 1, [2], 2>;

defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;

// Unsigned and signed integer division share the same numbers per width.
defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>;             // Conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>;  // x87 conditional move.
def  : WriteRes<WriteSETCC, [JALU01]>;                       // Setcc.
def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
def  : WriteRes<WriteLAHFSAHF, [JALU01]>;

defm : X86WriteRes<WriteBitTest,         [JALU01],       1, [1],   1>;
defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
defm : X86WriteRes<WriteBitTestSet,      [JALU01],       1, [1],   2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF,    [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR,    [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT,  [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT,  [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri,  [JALU01],        3, [6],     6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01],        4, [8],     7>;
defm : X86WriteRes<WriteSHDmri,  [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteLoad, [JLAGU]> {
  let Latency = 3;
}
def : WriteRes<WriteStore,   [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove,    [JALU01]>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> {
  let Latency = 3;
}
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

// Modeled with an empty resource list.
def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem, [JALU01]> {
  let Latency = 100;
}
def : WriteRes<WriteMicrocoded, [JALU01]> {
  let Latency = 100;
}
def : WriteRes<WriteFence, [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> {
  let Latency = 1;
}

// Register-register 8-bit CMPXCHG.
def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let Latency        = 3;
  let NumMicroOps    = 3;
  let ResourceCycles = [3];
}

// LOCK-prefixed 8-bit CMPXCHG with a memory operand.
def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 16;
  let NumMicroOps    = 5;
  let ResourceCycles = [3,16,16];
}

// LOCK-prefixed CMPXCHG with a memory operand (remaining operand sizes).
def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 17;
  let NumMicroOps    = 6;
  let ResourceCycles = [3,17,17];
}

// Non-atomic 8-bit CMPXCHG with a memory operand.
def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 11;
  let NumMicroOps    = 5;
  let ResourceCycles = [3,1,1];
}

// Non-atomic CMPXCHG8B.
def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 11;
  let NumMicroOps    = 18;
  let ResourceCycles = [3,1,1];
}

// Non-atomic CMPXCHG16B.
def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 32;
  let NumMicroOps    = 28;
  let ResourceCycles = [6,1,1];
}

// LOCK-prefixed CMPXCHG8B.
def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 19;
  let NumMicroOps    = 18;
  let ResourceCycles = [3,19,19];
}

// LOCK-prefixed CMPXCHG16B.
def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 38;
  let NumMicroOps    = 28;
  let ResourceCycles = [6,38,38];
}

// Selects one of the CMPXCHG writes above by predicate. The entries are listed
// with the atomic forms first and the plain register-register form as the
// NoSchedPred fallback.
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
]>;

// The first five reads are contributed by the memory load operand.
// We ignore those reads and set a read-advance for the other input operands
// including the implicit read of RAX.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8, LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
                     CMPXCHG8rm, CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

def : InstRW<[JWriteCMPXCHGVariant],
             (instrs CMPXCHG8rr, CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;

def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8B, LCMPXCHG16B, CMPXCHG8B, CMPXCHG16B)>;

// LOCK-prefixed single-operand ALU read-modify-write.
def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency        = 19;
  let NumMicroOps    = 1;
  let ResourceCycles = [1,19,19];
}

// Use the LOCK write when the lock prefix is present, WriteALURMW otherwise.
def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred,                       [WriteALURMW]>
]>;
def : InstRW<[JWriteLOCK_ALURMWVariant],
             (instrs INC8m, INC16m, INC32m, INC64m,
                     DEC8m, DEC16m, DEC32m, DEC64m,
                     NOT8m, NOT16m, NOT32m, NOT64m,
                     NEG8m, NEG16m, NEG32m, NEG64m)>;

// Register-register XCHG8 and XADD.
def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let Latency        = 2;
  let NumMicroOps    = 3;
  let ResourceCycles = [3];
}
def : InstRW<[JWriteXCHG8rr_XADDrr],
             (instrs XCHG8rr, XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;

// This write defines the latency of the in/out register operand of a non-atomic
// XADDrm. It is the first of a pair of writes that model non-atomic XADDrm
// instructions (the second write definition is JWriteXADDrm_LdSt_Part).
//
// Two writes are needed because the instruction latency differs from the output
// register operand latency. The first write describes the first (and only)
// output register operand of the instruction, while the instruction latency is
// set to the MAX of all the write latencies; the second write supplies that
// larger value (see example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available in 3 cycles. That is because the value of ECX
// is exchanged with the value loaded from (%rsp), and the load-to-use latency
// is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency        = 3; // load-to-use latency
  let NumMicroOps    = 3;
  let ResourceCycles = [3];
}

// This write defines the latency of the in/out register operand of an atomic
// XADDrm. It is the first of a sequence of two writes used to model atomic
// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX becomes available only after 11cy from the start of
// execution. This write is used to specifically set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency        = 11;
  let NumMicroOps    = 3;
  let ResourceCycles = [3];
}

// This write defines the latency of the in/out register operand of an atomic
// XCHGrm. It is the first of a sequence of two writes that describe atomic
// XCHG operations. Two writes are needed because the instruction latency
// differs from the output register write latency: the output register operand
// must become visible after 11cy, while the instruction latency is set to
// 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency        = 11;
  let NumMicroOps    = 2;
  let ResourceCycles = [2];
}

// Load/store half of a non-atomic XADDrm (11cy instruction latency).
def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency        = 11;
  let NumMicroOps    = 1;
  let ResourceCycles = [1, 1];
}

// Load/store half of XCHGrm and of atomic XADDrm (16cy instruction latency).
def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency        = 16;
  let NumMicroOps    = 1;
  let ResourceCycles = [16, 16];
}

// First write of an XADDrm: register-operand latency, chosen by lock prefix.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred,                       [JWriteXADDrm_XCHG_Part]>
]>;

// Second write of an XADDrm: load/store part, chosen by lock prefix.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred,                       [JWriteXADDrm_LdSt_Part]>
]>;

def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
             (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                     LXADD8, LXADD16, LXADD32, LXADD64)>;

def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
             (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;


////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

// x87 constant loads and FP loads.
defm : X86WriteRes<WriteFLD0,         [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLD1,         [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLDC,         [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLoad,        [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX,       [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY,       [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad,  [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

// FP stores.
defm : X86WriteRes<WriteFStore,    [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX,   [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY,   [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT,  [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;

// FP masked stores.
defm : X86WriteRes<WriteFMaskedStore32,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
defm : X86WriteRes<WriteFMaskedStore64,  [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;

// FP register moves.
defm : X86WriteRes<WriteFMove,  [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;

defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;

// FP add.
defm : JWriteResFpuPair<WriteFAdd,    [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX,   [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY,   [JFPU0, JFPA], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64,  [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;

// FP compare.
defm : JWriteResFpuPair<WriteFCmp,    [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX,   [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY,   [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64,  [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom,    [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFComX,   [JFPU0, JFPA, JALU0], 3>;

// FP multiply.
defm : JWriteResFpuPair<WriteFMul,    [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX,   [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY,   [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64,  [JFPU1, JFPM], 4, [1,2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1,2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2,4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;

// FMA is marked unsupported in this model.
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;

// Dot products.
defm : JWriteResFpuPair<WriteDPPD,  [JFPU1, JFPM, JFPA],  9, [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS,  [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;

// Reciprocal and reciprocal square root estimates.
defm : JWriteResFpuPair<WriteFRcp,    [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX,   [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY,   [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt,  [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;

// FP divide.
defm : JWriteResFpuPair<WriteFDiv,    [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX,   [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY,   [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64,  [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;

// FP square root.
defm : JWriteResFpuPair<WriteFSqrt,    [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX,   [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY,   [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64,  [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80,  [JFPU1, JFPM], 35, [1, 35]>;

// Sign, rounding, logic and test operations.
defm : JWriteResFpuPair<WriteFSign,   [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd,    [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY,   [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic,  [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest,   [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;

// Shuffles and blends.
defm : JWriteResFpuPair<WriteFShuffle,     [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY,    [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle,  [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY, [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend,       [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY,      [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend,    [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY,   [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256,  [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

// FP -> integer.
defm : JWriteResFpuPair<WriteCvtSS2I,  [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I,  [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I,  [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I,  [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

// Integer -> FP.
defm : X86WriteRes<WriteCvtI2SS,       [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd,     [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS,  [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD,       [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd,     [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD,  [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

// Single -> double.
defm : JWriteResFpuPair<WriteCvtSS2SD,  [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD,  [JFPU1, JSTC], 2, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> single.
defm : JWriteResFpuPair<WriteCvtSD2SS,  [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS,  [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

// Half -> single.
defm : JWriteResFpuPair<WriteCvtPH2PS,  [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

// Single -> half, register and store forms.
defm : X86WriteRes<WriteCvtPS2PH,    [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY,   [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt,  [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

// Vector loads.
defm : X86WriteRes<WriteVecLoad,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX,       [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteVecLoadY,       [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT,      [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY,     [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad,  [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

// Vector stores; the masked-store classes are marked unsupported.
defm : X86WriteRes<WriteVecStore,    [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX,   [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY,   [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT,  [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;

// Vector register moves, including transfers to and from GPRs.
defm : X86WriteRes<WriteVecMove,        [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX,       [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY,       [JFPU01, JVALU], 1, [2, 2], 2>;
defm : X86WriteRes<WriteVecMoveToGpr,   [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;

// Vector ALU and shifts; the Y/Z forms are marked unsupported.
defm : JWriteResFpuPair<WriteVecALU,       [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX,      [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX,    [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;

// Vector integer multiplies and sum-of-absolute-differences.
defm : JWriteResFpuPair<WriteVecIMul,  [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD,   [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD,    [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : JWriteResFpuPair<WritePSADBW,   [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX,  [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
// PHMINPOS class.
defm : JWriteResFpuPair<WritePHMINPOS, [JFPU01, JVALU], 2>;

// Shuffles (fixed and variable).
defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;

// Blends (fixed and variable).
defm : JWriteResFpuPair<WriteBlend,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;

// Vector logic.
defm : JWriteResFpuPair<WriteVecLogic,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;

// Vector test; both supported forms list the integer pipe JALU0.
defm : JWriteResFpuPair<WriteVecTest,  [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;

// 256-bit shuffle classes are marked unsupported.
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

// The load form of insert lists JLAGU; the store form of extract lists JSAGU.
defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU], 1, [1, 1], 2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1, 1, 1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WritePCmpIStrI,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM,
                        [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        8, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpEStrI,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM,
                        [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

// The three supported MOVMSK writes share the same resources and Latency = 3.
def  : WriteRes<WriteFMOVMSK,   [JFPU0, JFPA, JALU0]> { let Latency = 3; }
def  : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def  : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1, 1, 1, 1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA], 4>;            // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2, 2], 2>; // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;          // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// Dedicated write for INSERTQ/INSERTQI: latency 2, resource cycles [1, 4]
// on [JFPU01, JVALU].
def JWriteINSERTQ : SchedWriteRes<[JFPU01, JVALU]> {
  let Latency        = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// Write for the register form of VEXTRACTF128; no SchedWriteRes field is
// overridden here.
def JWriteVecExtractF128 : SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

// Write shared by the listed YMM broadcast-from-memory instructions.
def JWriteVBROADCASTYLd : SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let Latency        = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps    = 2;
}
def : InstRW<[JWriteVBROADCASTYLd],
             (instrs VBROADCASTSDYrm, VBROADCASTSSYrm, VBROADCASTF128)>;

// VZEROALL: no processor resources listed; only latency and micro-op count.
def JWriteJVZEROALL : SchedWriteRes<[]> {
  let Latency     = 90;
  let NumMicroOps = 73;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

// VZEROUPPER: no processor resources listed; only latency and micro-op count.
def JWriteJVZEROUPPER : SchedWriteRes<[]> {
  let Latency     = 46;
  let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ
///////////////////////////////////////////////////////////////////////////////

// ResourceCycles entries pair up positionally with the resource list.
def JWriteMASKMOVDQU : SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC,
                                      JLAGU, JSAGU, JALU01]> {
  let Latency        = 34;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
  let NumMicroOps    = 63;
}
def : InstRW<[JWriteMASKMOVDQU],
             (instrs MASKMOVDQU, MASKMOVDQU64, VMASKMOVDQU, VMASKMOVDQU64)>;

///////////////////////////////////////////////////////////////////////////////
//  SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// A write with no resources and zero latency.
def JWriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
}

// A two-micro-op write on [JFPU01, JFPX].
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
  let NumMicroOps = 2;
}

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
//            15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
//            Section 21.8 [Dependency-breaking instructions].

// Each variant below picks its first write when the zero-idiom predicate
// matches and otherwise falls back to the regular write listed under
// NoSchedPred.

// GPR SUB/XOR.
def JWriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, XOR32rr, XOR64rr)>;

// XMM floating-point XOR/ANDN.
def JWriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom],
             (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
                     ANDNPSrr, VANDNPSrr, ANDNPDrr, VANDNPDrr)>;

// YMM floating-point XOR/ANDN; the idiom case uses JWriteZeroIdiomYmm.
def JWriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY],
             (instrs VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr)>;

// MMX integer PXOR/PANDN.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;

// XMM integer PXOR/PANDN.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX],
             (instrs PXORrr, VPXORrr, PANDNrr, VPANDNrr)>;

// MMX PSUB*/PCMPGT*.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU],
             (instrs MMX_PSUBBirr, MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
                     MMX_PSUBSBirr, MMX_PSUBSWirr,
                     MMX_PSUBUSBirr, MMX_PSUBUSWirr,
                     MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr)>;

// XMM PSUB*/PCMPGT* (SSE and VEX-encoded forms).
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX],
             (instrs PSUBBrr, VPSUBBrr, PSUBDrr, VPSUBDrr,
                     PSUBQrr, VPSUBQrr, PSUBWrr, VPSUBWrr,
                     PSUBSBrr, VPSUBSBrr, PSUBSWrr, VPSUBSWrr,
                     PSUBUSBrr, VPSUBUSBrr, PSUBUSWrr, VPSUBUSWrr,
                     PCMPGTBrr, VPCMPGTBrr, PCMPGTDrr, VPCMPGTDrr,
                     PCMPGTQrr, VPCMPGTQrr, PCMPGTWrr, VPCMPGTWrr)>;

// VPERM2F128rr is gated by its own predicate, ZeroIdiomVPERMPredicate.
def JWriteVPERM2F128 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
  let Latency = 2;
}

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1: operand 2 is an immediate and
    // that immediate is not 1.
    CheckAll<[CheckIsImmOperand<2>, CheckNot<CheckImmOperand<2, 1>>]>
  ]>
>;

// Slow LEAs use JWrite3OpsLEA; all others keep the default WriteLEA.
def JWriteLEA : SchedWriteVariant<[
  SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
  SchedVar<NoSchedPred,       [WriteLEA]>
]>;
def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// LEA16r is bound unconditionally to its own write.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let Latency        = 3;
  let ResourceCycles = [4];
}
def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes treated as zero-idioms, each class paired with its predicate.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[SUB32rr, SUB64rr, XOR32rr, XOR64rr], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr, MMX_PANDNirr,
    MMX_PSUBBirr, MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  DepBreakingClass<[VPERM2F128rr], ZeroIdiomVPERMPredicate>
]>;

// Opcodes registered as dependency-breaking. CMP is checked with
// CheckSameRegOperand<0, 1>; every other class uses ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[SBB32rr, SBB64rr], ZeroIdiomPredicate>,
  DepBreakingClass<[CMP32rr, CMP64rr], CheckSameRegOperand<0, 1>>,

  // MMX
  DepBreakingClass<[MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr],
                   ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr],
                   ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr],
                   ZeroIdiomPredicate>
]>;

// Register-to-register moves registered as optimizable, with no extra
// condition (TruePred).
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr, MOVAPDrr, MOVUPDrr, MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr, VMOVAPDrr, VMOVUPDrr, VMOVDQArr, VMOVDQUrr
  ], TruePred>
]>;

} // SchedModel