//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. It is based on
// the AMD Software Optimization Guide for AMD Family 16h Processors and its
// Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

def BtVer2Model : SchedMachineModel {
  // All x86 instructions are modeled as a single micro-op, and btver2 can
  // decode 2 instructions per cycle.
  let IssueWidth = 2;
  let MicroOpBufferSize = 64; // Retire Control Unit
  let LoadLatency = 5; // FPU latency (worst case; cf. Integer 3 cycle latency)
  let HighLatency = 25;
  let MispredictPenalty = 14; // Minimum branch misdirection penalty
  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  // the scheduler to assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle; one ProcResource per pipe.
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The processor always keeps the different parts of an integer register
// together. An instruction that writes to a part of a register will therefore
// have a false dependence on any previous write to the same register or any
// part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
                               0,  // Max moves that can be eliminated per cycle.
                               1>; // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from an MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF : RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
                           0,  // Max moves that can be eliminated per cycle.
                           1>; // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  let BufferSize = 20;
}

// AGU Pipe Scheduler
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  let BufferSize = 12;
}

// Fpu Pipe Scheduler
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  let BufferSize = 18;
}

// Functional units
def JDiv : ProcResource<1>; // integer division
def JMul : ProcResource<1>; // integer multiplication
def JVALU0 : ProcResource<1>; // vector integer
def JVALU1 : ProcResource<1>; // vector integer
def JVIMUL : ProcResource<1>; // vector integer multiplication
def JSTC : ProcResource<1>; // vector store/convert
def JFPM : ProcResource<1>; // FP multiplication
def JFPA : ProcResource<1>; // FP addition

// Functional unit groups
def JFPX : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available
// until 3 cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
// until 5 cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// The multiclasses below define the resource usage for variants with and
// without folded loads.
//
// Parameters shared by all three multiclasses:
//   SchedRW  - the foldable write; SchedRW.Folded is its load-folded form.
//   ExePorts - resources used by the register variant.
//   Lat      - Latency of the register variant.
//   Res      - ResourceCycles of the register variant.
//   UOps     - NumMicroOps of the register variant.
//   LoadUOps - extra micro-ops added to UOps for the folded variant.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the cycles given in Res.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 3);
    // An empty Res stays empty; otherwise one cycle is prepended for JLAGU so
    // the cycle list lines up with the extended resource list.
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the cycles given in Res.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    // An empty Res stays empty; otherwise one cycle is prepended for JLAGU so
    // the cycle list lines up with the extended resource list.
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the cycles given in Res (the default
  // is two cycles and two micro-ops).
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data; most RMW
// instructions don't need an extra uop. ALU RMW operations don't seem to
// benefit from STLF, and their observed latency is 6cy. That is the reason why
// this write adds two extra cycles (instead of just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;

defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;

// Unsigned and signed integer division.
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1, 1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01, JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;

// Bit test (and set) with register, immediate-load and register-load forms.
defm : X86WriteRes<WriteBitTest, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd, [JALU01, JLAGU], 4, [1, 1], 1>;
defm : X86WriteRes<WriteBitTestRegLd, [JALU01, JLAGU], 4, [1, 1], 5>;
defm : X86WriteRes<WriteBitTestSet, [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01, JLAGU], 4, [1, 1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01, JLAGU], 4, [1, 1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS, [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

// No processor resources are listed for zero idioms.
def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteFence, [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }

// Dedicated writes for the CMPXCHG family. JWriteCMPXCHGVariant below selects
// one of them per instruction form.
def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [3, 16, 16];
  let NumMicroOps = 5;
}

def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 17;
  let ResourceCycles = [3, 17, 17];
  let NumMicroOps = 6;
}

def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3, 1, 1];
  let NumMicroOps = 5;
}

def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [3, 1, 1];
  let NumMicroOps = 18;
}

def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 32;
  let ResourceCycles = [6, 1, 1];
  let NumMicroOps = 28;
}

def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [3, 19, 19];
  let NumMicroOps = 18;
}

def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 38;
  let ResourceCycles = [6, 38, 38];
  let NumMicroOps = 28;
}

// The atomic predicates are listed first, the 8B/16B and 8-bit forms come
// before the generic ones, and the final entry is the unconditional fallback.
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
  SchedVar<NoSchedPred, [WriteCMPXCHG]>
]>;

// The first five reads are contributed by the memory load operand.
// We ignore those reads and set a read-advance for the other input operands
// including the implicit read of RAX.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8, LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
                     CMPXCHG8rm, CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

def : InstRW<[JWriteCMPXCHGVariant],
             (instrs CMPXCHG8rr, CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;

def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8B, LCMPXCHG16B, CMPXCHG8B, CMPXCHG16B)>;

// Write used for the unary ALU RMW instructions listed below when the
// CheckLockPrefix predicate matches.
def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let Latency = 19;
  let ResourceCycles = [1, 19, 19];
  let NumMicroOps = 1;
}

def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred, [WriteALURMW]>
]>;
def : InstRW<[JWriteLOCK_ALURMWVariant],
             (instrs INC8m, INC16m, INC32m, INC64m,
                     DEC8m, DEC16m, DEC32m, DEC64m,
                     NOT8m, NOT16m, NOT32m, NOT64m,
                     NEG8m, NEG16m, NEG32m, NEG64m)>;

def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}
def : InstRW<[JWriteXCHG8rr_XADDrr],
             (instrs XCHG8rr, XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;

// This write defines the latency of the in/out register operand of a non-atomic
// XADDrm. This is the first of a pair of writes that model non-atomic
// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
//
// We need two writes because the instruction latency differs from the output
// register operand latency. In particular, the first write describes the first
// (and only) output register operand of the instruction. However, the
// instruction latency is set to the MAX of all the write latencies. That's why
// a second write is needed in this case (see example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available in 3 cycles. That is because the value of ECX
// is exchanged with the value loaded from the memory operand, and the
// load-to-use latency is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 3; // load-to-use latency
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XADDrm. This is the first of a sequence of two writes used to model atomic
// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX becomes available only after 11cy from the start of
// execution. This write is used to specifically set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [3];
  let NumMicroOps = 3;
}

// This write defines the latency of the in/out register operand of an atomic
// XCHGrm. This write is the first of a sequence of two writes that describe
// atomic XCHG operations. We need two writes because the instruction latency
// differs from the output register write latency. We want to make sure that
// the output register operand becomes visible after 11cy. However, we want to
// set the instruction latency to 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let Latency = 11;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}

// Load/store halves of the two-write sequences described above.
def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 11;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}

def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let Latency = 16;
  let ResourceCycles = [16, 16];
  let NumMicroOps = 1;
}

// First write of XADDrm: the LOCK-prefixed form gets the 11cy register write,
// every other form gets the 3cy one.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
]>;

// Second write of XADDrm: the LOCK-prefixed form reuses the XCHGrm load/store
// part, every other form uses JWriteXADDrm_LdSt_Part.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
]>;

def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
             (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                     LXADD8, LXADD16, LXADD32, LXADD64)>;

def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
             (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;


////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

// x87 constant loads and FP loads.
defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

// FP stores.
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;

// FP masked stores.
defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1, 1, 5, 5, 4, 4, 4], 19>;
defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1, 1, 2, 2, 2, 2, 2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1, 1, 10, 10, 8, 8, 8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1, 1, 4, 4, 4, 4, 4], 18>;

// FP register moves.
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;

defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;

// FP arithmetic. The *Z writes are marked unsupported in this model.
defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;
defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY, [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

// FP -> integer.
defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

// Integer -> FP.
defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

// Single -> double precision.
defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> single precision.
defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

// Half -> single precision.
defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

// Single -> half precision, register and store forms.
defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2, 2, 2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2, 2, 2, 1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

// Vector loads.
defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

// Vector stores.
defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;

// Vector register moves, including transfers to and from GPRs.
defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;

// Vector integer ALU, shift and multiply. The *Y and *Z writes are marked
// unsupported in this model.
defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : JWriteResFpuPair<WritePHMINPOS,
[JFPU01, JVALU], 2>; 711defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>; 712defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>; 713defm : X86WriteResPairUnsupported<WriteShuffleY>; 714defm : X86WriteResPairUnsupported<WriteShuffleZ>; 715defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 1], 1>; 716defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>; 717defm : X86WriteResPairUnsupported<WriteVarShuffleY>; 718defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; 719defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>; 720defm : X86WriteResPairUnsupported<WriteBlendY>; 721defm : X86WriteResPairUnsupported<WriteBlendZ>; 722defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [4, 4], 3>; 723defm : X86WriteResPairUnsupported<WriteVarBlendY>; 724defm : X86WriteResPairUnsupported<WriteVarBlendZ>; 725defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>; 726defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>; 727defm : X86WriteResPairUnsupported<WriteVecLogicY>; 728defm : X86WriteResPairUnsupported<WriteVecLogicZ>; 729defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>; 730defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>; 731defm : X86WriteResPairUnsupported<WriteVecTestZ>; 732defm : X86WriteResPairUnsupported<WriteShuffle256>; 733defm : X86WriteResPairUnsupported<WriteVarShuffle256>; 734 735//////////////////////////////////////////////////////////////////////////////// 736// Vector insert/extract operations. 
////////////////////////////////////////////////////////////////////////////////

// Insert/extract classes. The load form (*Ld) adds JLAGU and the store form
// (*St) adds JSAGU to the resource list.
defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU], 1, [1, 1], 2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1, 1, 1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

// The two WritePCmpIStr* classes share one resource list, as do the two
// WritePCmpEStr* classes; only the IStrI/IStrM latencies differ (7 vs. 8).
defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0],
                        8, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0],
                        14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

// All modeled MOVMSK classes use [JFPU0, JFPA, JALU0] with a latency of 3.
def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> {
  let Latency = 3;
}
def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> {
  let Latency = 3;
}
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> {
  let Latency = 3;
}

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1, 1, 1, 1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA], 4>;             // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2, 2], 2>;  // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;           // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// Dedicated write for INSERTQ/INSERTQI: latency 2, with JVALU held for 4
// cycles and JFPU01 for 1.
def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
  let Latency = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// VEXTRACTF128rr: uses JFPU01 and JFPX with the SchedWriteRes defaults (no
// field is overridden here).
def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

// Broadcast-from-memory writes: two micro-ops, latency 6.
def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let Latency = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps = 2;
}
def : InstRW<[JWriteVBROADCASTYLd],
             (instrs VBROADCASTSDYrm, VBROADCASTSSYrm, VBROADCASTF128)>;

// VZEROALL and VZEROUPPER name no processor resources; only their latency and
// micro-op count are modeled.
def JWriteJVZEROALL: SchedWriteRes<[]> {
  let Latency = 90;
  let NumMicroOps = 73;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

def JWriteJVZEROUPPER: SchedWriteRes<[]> {
  let Latency = 46;
  let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
///////////////////////////////////////////////////////////////////////////////

// ResourceCycles entries pair up positionally with the resource list:
// JFPU0=1, JFPA=1, JFPU1=2, JSTC=2, JLAGU=2, JSAGU=16, JALU01=42.
def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
  let Latency = 34;
  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
  let NumMicroOps = 63;
}
def : InstRW<[JWriteMASKMOVDQU],
             (instrs MASKMOVDQU, MASKMOVDQU64, VMASKMOVDQU, VMASKMOVDQU64)>;

///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// A write that consumes no resources and has zero latency.
def JWriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
}

// Two-micro-op write on JFPU01/JFPX; used by the YMM zero-idiom variants.
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
  let NumMicroOps = 2;
}

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].

// Each variant below resolves to a zero-idiom write when its predicate holds
// and otherwise (NoSchedPred) falls back to the ordinary write class.

// GPR zero idioms; fallback is WriteALU.
def JWriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, XOR32rr, XOR64rr)>;

// XMM floating-point logic zero idioms; fallback is WriteFLogic.
def JWriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr,  VXORPSrr,
                                         XORPDrr,  VXORPDrr,
                                         ANDNPSrr, VANDNPSrr,
                                         ANDNPDrr, VANDNPDrr)>;

// YMM floating-point logic zero idioms resolve to JWriteZeroIdiomYmm rather
// than JWriteZeroLatency; fallback is WriteFLogicY.
def JWriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr,  VXORPDYrr,
                                          VANDNPSYrr, VANDNPDYrr)>;

// MMX logic zero idioms; fallback is WriteVecLogic.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;

// XMM integer logic zero idioms; fallback is WriteVecLogicX.
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr,  VPXORrr,
                                               PANDNrr, VPANDNrr)>;

// MMX subtract/compare zero idioms; fallback is WriteVecALU.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr,   MMX_PSUBDirr,
                                            MMX_PSUBQirr,   MMX_PSUBWirr,
                                            MMX_PSUBSBirr,  MMX_PSUBSWirr,
                                            MMX_PSUBUSBirr, MMX_PSUBUSWirr,
                                            MMX_PCMPGTBirr, MMX_PCMPGTDirr,
                                            MMX_PCMPGTWirr)>;

// XMM subtract/compare zero idioms; fallback is WriteVecALUX.
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr,   VPSUBBrr,
                                             PSUBDrr,   VPSUBDrr,
                                             PSUBQrr,   VPSUBQrr,
                                             PSUBWrr,   VPSUBWrr,
                                             PSUBSBrr,  VPSUBSBrr,
                                             PSUBSWrr,  VPSUBSWrr,
                                             PSUBUSBrr, VPSUBUSBrr,
                                             PSUBUSWrr, VPSUBUSWrr,
                                             PCMPGTBrr, VPCMPGTBrr,
                                             PCMPGTDrr, VPCMPGTDrr,
                                             PCMPGTQrr, VPCMPGTQrr,
                                             PCMPGTWrr, VPCMPGTWrr)>;

// VPERM2F128rr is checked with its own predicate (ZeroIdiomVPERMPredicate);
// fallback is WriteFShuffle256.
def JWriteVPERM2F128 : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
    SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
  let Latency = 2;
}

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1: operand 2 is an immediate and
    // that immediate is not 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Slow LEAs use JWrite3OpsLEA; every other LEA uses the generic WriteLEA.
def JWriteLEA : SchedWriteVariant<[
    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
    SchedVar<NoSchedPred,       [WriteLEA]>
]>;

def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// LEA16r gets an unconditional write of its own: latency 3, with JALU01 held
// for 4 cycles.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let Latency = 3;
  let ResourceCycles = [4];
}

def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes treated as zero idioms, grouped by register class, each paired with
// the predicate that must hold.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;

// Opcodes registered as dependency-breaking through IsDepBreakingFunction
// (a separate list from the zero idioms above).
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>
]>;

// Register-to-register moves listed as optimizable; the class predicate is
// TruePred, so no extra condition is attached to any listed opcode.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr,
    MOVAPDrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr,
    VMOVAPDrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;

} // SchedModel