/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per-engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before), we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
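 *
 * (Aside: the submission steps A) through D) above can be pictured with the
 * following rough, purely illustrative pseudo-code -- the helper names are
 * hypothetical and not part of the driver:
 *
 *	ce = lookup_context(file, ctx_id);		A) choose your context
 *	ring = ce->ring;				B) its virtualized ring
 *	emit(ring, MI_BATCH_BUFFER_START, ...);		C) write commands to it
 *	elsp_submit(engine, ce->lrc_desc);		D) tell the GPU to switch
 *
 * The rest of this file implements the real versions of those steps.)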
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy ringbuffer-based method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
 * This request will then be resubmitted along with a new request for a
 * different context, which will cause the hardware to continue executing the
 * second request and queue the new request (the GPU detects the condition of
 * a context getting preempted with the same context and optimizes the context
 * switch flow by not doing preemption, but just sampling the new tail
 * pointer).
 *
 */
#include <linux/interrupt.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
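	 *
	 * (Hypothetical illustration, not taken from any real topology: a
	 * bond with master == rcs0 and sibling_mask == vcs1->mask means a
	 * request carrying a submit-fence from a request on rcs0 may only
	 * be placed onto vcs1, even if the virtual engine has more siblings.)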
213 */ 214 struct ve_bond { 215 const struct intel_engine_cs *master; 216 intel_engine_mask_t sibling_mask; 217 } *bonds; 218 unsigned int num_bonds; 219 220 /* And finally, which physical engines this virtual engine maps onto. */ 221 unsigned int num_siblings; 222 struct intel_engine_cs *siblings[0]; 223 }; 224 225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 226 { 227 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 228 return container_of(engine, struct virtual_engine, base); 229 } 230 231 static int __execlists_context_alloc(struct intel_context *ce, 232 struct intel_engine_cs *engine); 233 234 static void execlists_init_reg_state(u32 *reg_state, 235 const struct intel_context *ce, 236 const struct intel_engine_cs *engine, 237 const struct intel_ring *ring, 238 bool close); 239 static void 240 __execlists_update_reg_state(const struct intel_context *ce, 241 const struct intel_engine_cs *engine); 242 243 static void mark_eio(struct i915_request *rq) 244 { 245 if (i915_request_completed(rq)) 246 return; 247 248 GEM_BUG_ON(i915_request_signaled(rq)); 249 250 dma_fence_set_error(&rq->fence, -EIO); 251 i915_request_mark_complete(rq); 252 } 253 254 static struct i915_request * 255 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 256 { 257 struct i915_request *active = rq; 258 259 rcu_read_lock(); 260 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 261 if (i915_request_completed(rq)) 262 break; 263 264 active = rq; 265 } 266 rcu_read_unlock(); 267 268 return active; 269 } 270 271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 272 { 273 return (i915_ggtt_offset(engine->status_page.vma) + 274 I915_GEM_HWS_PREEMPT_ADDR); 275 } 276 277 static inline void 278 ring_set_paused(const struct intel_engine_cs *engine, int state) 279 { 280 /* 281 * We inspect HWS_PREEMPT with a semaphore inside 282 * engine->emit_fini_breadcrumb. If the dword is true, 283 * the ring is paused as the semaphore will busywait 284 * until the dword is false. 285 */ 286 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 287 if (state) 288 wmb(); 289 } 290 291 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 292 { 293 return rb_entry(rb, struct i915_priolist, node); 294 } 295 296 static inline int rq_prio(const struct i915_request *rq) 297 { 298 return rq->sched.attr.priority; 299 } 300 301 static int effective_prio(const struct i915_request *rq) 302 { 303 int prio = rq_prio(rq); 304 305 /* 306 * If this request is special and must not be interrupted at any 307 * cost, so be it. Note we are only checking the most recent request 308 * in the context and so may be masking an earlier vip request. It 309 * is hoped that under the conditions where nopreempt is used, this 310 * will not matter (i.e. all requests to that context will be 311 * nopreempt for as long as desired). 312 */ 313 if (i915_request_has_nopreempt(rq)) 314 prio = I915_PRIORITY_UNPREEMPTABLE; 315 316 /* 317 * On unwinding the active request, we give it a priority bump 318 * if it has completed waiting on any semaphore. If we know that 319 * the request has already started, we can prevent an unwanted 320 * preempt-to-idle cycle by taking that into account now. 
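	 *
	 * (Put differently: a request that has already started is given the
	 * same NOSEMAPHORE boost below, so another request whose only
	 * advantage is that boost cannot trigger a pointless preempt-to-idle
	 * cycle against it.)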
321 */ 322 if (__i915_request_has_started(rq)) 323 prio |= I915_PRIORITY_NOSEMAPHORE; 324 325 /* Restrict mere WAIT boosts from triggering preemption */ 326 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */ 327 return prio | __NO_PREEMPTION; 328 } 329 330 static int queue_prio(const struct intel_engine_execlists *execlists) 331 { 332 struct i915_priolist *p; 333 struct rb_node *rb; 334 335 rb = rb_first_cached(&execlists->queue); 336 if (!rb) 337 return INT_MIN; 338 339 /* 340 * As the priolist[] are inverted, with the highest priority in [0], 341 * we have to flip the index value to become priority. 342 */ 343 p = to_priolist(rb); 344 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 345 } 346 347 static inline bool need_preempt(const struct intel_engine_cs *engine, 348 const struct i915_request *rq, 349 struct rb_node *rb) 350 { 351 int last_prio; 352 353 if (!intel_engine_has_semaphores(engine)) 354 return false; 355 356 /* 357 * Check if the current priority hint merits a preemption attempt. 358 * 359 * We record the highest value priority we saw during rescheduling 360 * prior to this dequeue, therefore we know that if it is strictly 361 * less than the current tail of ESLP[0], we do not need to force 362 * a preempt-to-idle cycle. 363 * 364 * However, the priority hint is a mere hint that we may need to 365 * preempt. If that hint is stale or we may be trying to preempt 366 * ourselves, ignore the request. 367 * 368 * More naturally we would write 369 * prio >= max(0, last); 370 * except that we wish to prevent triggering preemption at the same 371 * priority level: the task that is running should remain running 372 * to preserve FIFO ordering of dependencies. 373 */ 374 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); 375 if (engine->execlists.queue_priority_hint <= last_prio) 376 return false; 377 378 /* 379 * Check against the first request in ELSP[1], it will, thanks to the 380 * power of PI, be the highest priority of that context. 381 */ 382 if (!list_is_last(&rq->sched.link, &engine->active.requests) && 383 rq_prio(list_next_entry(rq, sched.link)) > last_prio) 384 return true; 385 386 if (rb) { 387 struct virtual_engine *ve = 388 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 389 bool preempt = false; 390 391 if (engine == ve->siblings[0]) { /* only preempt one sibling */ 392 struct i915_request *next; 393 394 rcu_read_lock(); 395 next = READ_ONCE(ve->request); 396 if (next) 397 preempt = rq_prio(next) > last_prio; 398 rcu_read_unlock(); 399 } 400 401 if (preempt) 402 return preempt; 403 } 404 405 /* 406 * If the inflight context did not trigger the preemption, then maybe 407 * it was the set of queued requests? Pick the highest priority in 408 * the queue (the first active priolist) and see if it deserves to be 409 * running instead of ELSP[0]. 410 * 411 * The highest priority request in the queue can not be either 412 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same 413 * context, it's priority would not exceed ELSP[0] aka last_prio. 414 */ 415 return queue_prio(&engine->execlists) > last_prio; 416 } 417 418 __maybe_unused static inline bool 419 assert_priority_queue(const struct i915_request *prev, 420 const struct i915_request *next) 421 { 422 /* 423 * Without preemption, the prev may refer to the still active element 424 * which we refuse to let go. 
425 * 426 * Even with preemption, there are times when we think it is better not 427 * to preempt and leave an ostensibly lower priority request in flight. 428 */ 429 if (i915_request_is_active(prev)) 430 return true; 431 432 return rq_prio(prev) >= rq_prio(next); 433 } 434 435 /* 436 * The context descriptor encodes various attributes of a context, 437 * including its GTT address and some flags. Because it's fairly 438 * expensive to calculate, we'll just do it once and cache the result, 439 * which remains valid until the context is unpinned. 440 * 441 * This is what a descriptor looks like, from LSB to MSB:: 442 * 443 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 444 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 445 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 446 * bits 53-54: mbz, reserved for use by hardware 447 * bits 55-63: group ID, currently unused and set to 0 448 * 449 * Starting from Gen11, the upper dword of the descriptor has a new format: 450 * 451 * bits 32-36: reserved 452 * bits 37-47: SW context ID 453 * bits 48:53: engine instance 454 * bit 54: mbz, reserved for use by hardware 455 * bits 55-60: SW counter 456 * bits 61-63: engine class 457 * 458 * engine info, SW context ID and SW counter need to form a unique number 459 * (Context ID) per lrc. 460 */ 461 static u64 462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 463 { 464 u64 desc; 465 466 desc = INTEL_LEGACY_32B_CONTEXT; 467 if (i915_vm_is_4lvl(ce->vm)) 468 desc = INTEL_LEGACY_64B_CONTEXT; 469 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 470 471 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 472 if (IS_GEN(engine->i915, 8)) 473 desc |= GEN8_CTX_L3LLC_COHERENT; 474 475 desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */ 476 /* 477 * The following 32bits are copied into the OA reports (dword 2). 478 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing 479 * anything below. 
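	 *
	 * (Purely illustrative example of the gen11+ layout documented above
	 * this function: a context with SW context ID 5 on engine class 1,
	 * instance 0 contributes to the upper dword roughly as
	 *
	 *	desc |= (u64)5 << 37;	(SW context ID, bits 37-47)
	 *	desc |= (u64)0 << 48;	(engine instance, bits 48-53)
	 *	desc |= (u64)1 << 61;	(engine class, bits 61-63)
	 *
	 * where the SW context ID itself is assigned at schedule-in time,
	 * see __execlists_schedule_in().)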
480 */ 481 if (INTEL_GEN(engine->i915) >= 11) { 482 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 483 /* bits 48-53 */ 484 485 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 486 /* bits 61-63 */ 487 } 488 489 return desc; 490 } 491 492 static u32 *set_offsets(u32 *regs, 493 const u8 *data, 494 const struct intel_engine_cs *engine) 495 #define NOP(x) (BIT(7) | (x)) 496 #define LRI(count, flags) ((flags) << 6 | (count)) 497 #define POSTED BIT(0) 498 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 499 #define REG16(x) \ 500 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 501 (((x) >> 2) & 0x7f) 502 #define END() 0 503 { 504 const u32 base = engine->mmio_base; 505 506 while (*data) { 507 u8 count, flags; 508 509 if (*data & BIT(7)) { /* skip */ 510 regs += *data++ & ~BIT(7); 511 continue; 512 } 513 514 count = *data & 0x3f; 515 flags = *data >> 6; 516 data++; 517 518 *regs = MI_LOAD_REGISTER_IMM(count); 519 if (flags & POSTED) 520 *regs |= MI_LRI_FORCE_POSTED; 521 if (INTEL_GEN(engine->i915) >= 11) 522 *regs |= MI_LRI_CS_MMIO; 523 regs++; 524 525 GEM_BUG_ON(!count); 526 do { 527 u32 offset = 0; 528 u8 v; 529 530 do { 531 v = *data++; 532 offset <<= 7; 533 offset |= v & ~BIT(7); 534 } while (v & BIT(7)); 535 536 *regs = base + (offset << 2); 537 regs += 2; 538 } while (--count); 539 } 540 541 return regs; 542 } 543 544 static const u8 gen8_xcs_offsets[] = { 545 NOP(1), 546 LRI(11, 0), 547 REG16(0x244), 548 REG(0x034), 549 REG(0x030), 550 REG(0x038), 551 REG(0x03c), 552 REG(0x168), 553 REG(0x140), 554 REG(0x110), 555 REG(0x11c), 556 REG(0x114), 557 REG(0x118), 558 559 NOP(9), 560 LRI(9, 0), 561 REG16(0x3a8), 562 REG16(0x28c), 563 REG16(0x288), 564 REG16(0x284), 565 REG16(0x280), 566 REG16(0x27c), 567 REG16(0x278), 568 REG16(0x274), 569 REG16(0x270), 570 571 NOP(13), 572 LRI(2, 0), 573 REG16(0x200), 574 REG(0x028), 575 576 END(), 577 }; 578 579 static const u8 gen9_xcs_offsets[] = { 580 NOP(1), 581 LRI(14, POSTED), 582 REG16(0x244), 583 REG(0x034), 584 REG(0x030), 585 REG(0x038), 586 REG(0x03c), 587 REG(0x168), 588 REG(0x140), 589 REG(0x110), 590 REG(0x11c), 591 REG(0x114), 592 REG(0x118), 593 REG(0x1c0), 594 REG(0x1c4), 595 REG(0x1c8), 596 597 NOP(3), 598 LRI(9, POSTED), 599 REG16(0x3a8), 600 REG16(0x28c), 601 REG16(0x288), 602 REG16(0x284), 603 REG16(0x280), 604 REG16(0x27c), 605 REG16(0x278), 606 REG16(0x274), 607 REG16(0x270), 608 609 NOP(13), 610 LRI(1, POSTED), 611 REG16(0x200), 612 613 NOP(13), 614 LRI(44, POSTED), 615 REG(0x028), 616 REG(0x09c), 617 REG(0x0c0), 618 REG(0x178), 619 REG(0x17c), 620 REG16(0x358), 621 REG(0x170), 622 REG(0x150), 623 REG(0x154), 624 REG(0x158), 625 REG16(0x41c), 626 REG16(0x600), 627 REG16(0x604), 628 REG16(0x608), 629 REG16(0x60c), 630 REG16(0x610), 631 REG16(0x614), 632 REG16(0x618), 633 REG16(0x61c), 634 REG16(0x620), 635 REG16(0x624), 636 REG16(0x628), 637 REG16(0x62c), 638 REG16(0x630), 639 REG16(0x634), 640 REG16(0x638), 641 REG16(0x63c), 642 REG16(0x640), 643 REG16(0x644), 644 REG16(0x648), 645 REG16(0x64c), 646 REG16(0x650), 647 REG16(0x654), 648 REG16(0x658), 649 REG16(0x65c), 650 REG16(0x660), 651 REG16(0x664), 652 REG16(0x668), 653 REG16(0x66c), 654 REG16(0x670), 655 REG16(0x674), 656 REG16(0x678), 657 REG16(0x67c), 658 REG(0x068), 659 660 END(), 661 }; 662 663 static const u8 gen12_xcs_offsets[] = { 664 NOP(1), 665 LRI(13, POSTED), 666 REG16(0x244), 667 REG(0x034), 668 REG(0x030), 669 REG(0x038), 670 REG(0x03c), 671 REG(0x168), 672 REG(0x140), 673 REG(0x110), 674 REG(0x1c0), 675 REG(0x1c4), 676 
REG(0x1c8), 677 REG(0x180), 678 REG16(0x2b4), 679 680 NOP(5), 681 LRI(9, POSTED), 682 REG16(0x3a8), 683 REG16(0x28c), 684 REG16(0x288), 685 REG16(0x284), 686 REG16(0x280), 687 REG16(0x27c), 688 REG16(0x278), 689 REG16(0x274), 690 REG16(0x270), 691 692 END(), 693 }; 694 695 static const u8 gen8_rcs_offsets[] = { 696 NOP(1), 697 LRI(14, POSTED), 698 REG16(0x244), 699 REG(0x034), 700 REG(0x030), 701 REG(0x038), 702 REG(0x03c), 703 REG(0x168), 704 REG(0x140), 705 REG(0x110), 706 REG(0x11c), 707 REG(0x114), 708 REG(0x118), 709 REG(0x1c0), 710 REG(0x1c4), 711 REG(0x1c8), 712 713 NOP(3), 714 LRI(9, POSTED), 715 REG16(0x3a8), 716 REG16(0x28c), 717 REG16(0x288), 718 REG16(0x284), 719 REG16(0x280), 720 REG16(0x27c), 721 REG16(0x278), 722 REG16(0x274), 723 REG16(0x270), 724 725 NOP(13), 726 LRI(1, 0), 727 REG(0x0c8), 728 729 END(), 730 }; 731 732 static const u8 gen11_rcs_offsets[] = { 733 NOP(1), 734 LRI(15, POSTED), 735 REG16(0x244), 736 REG(0x034), 737 REG(0x030), 738 REG(0x038), 739 REG(0x03c), 740 REG(0x168), 741 REG(0x140), 742 REG(0x110), 743 REG(0x11c), 744 REG(0x114), 745 REG(0x118), 746 REG(0x1c0), 747 REG(0x1c4), 748 REG(0x1c8), 749 REG(0x180), 750 751 NOP(1), 752 LRI(9, POSTED), 753 REG16(0x3a8), 754 REG16(0x28c), 755 REG16(0x288), 756 REG16(0x284), 757 REG16(0x280), 758 REG16(0x27c), 759 REG16(0x278), 760 REG16(0x274), 761 REG16(0x270), 762 763 LRI(1, POSTED), 764 REG(0x1b0), 765 766 NOP(10), 767 LRI(1, 0), 768 REG(0x0c8), 769 770 END(), 771 }; 772 773 static const u8 gen12_rcs_offsets[] = { 774 NOP(1), 775 LRI(13, POSTED), 776 REG16(0x244), 777 REG(0x034), 778 REG(0x030), 779 REG(0x038), 780 REG(0x03c), 781 REG(0x168), 782 REG(0x140), 783 REG(0x110), 784 REG(0x1c0), 785 REG(0x1c4), 786 REG(0x1c8), 787 REG(0x180), 788 REG16(0x2b4), 789 790 NOP(5), 791 LRI(9, POSTED), 792 REG16(0x3a8), 793 REG16(0x28c), 794 REG16(0x288), 795 REG16(0x284), 796 REG16(0x280), 797 REG16(0x27c), 798 REG16(0x278), 799 REG16(0x274), 800 REG16(0x270), 801 802 LRI(3, POSTED), 803 REG(0x1b0), 804 REG16(0x5a8), 805 REG16(0x5ac), 806 807 NOP(6), 808 LRI(1, 0), 809 REG(0x0c8), 810 811 END(), 812 }; 813 814 #undef END 815 #undef REG16 816 #undef REG 817 #undef LRI 818 #undef NOP 819 820 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 821 { 822 /* 823 * The gen12+ lists only have the registers we program in the basic 824 * default state. We rely on the context image using relative 825 * addressing to automatic fixup the register state between the 826 * physical engines for virtual engine. 
827 */ 828 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 829 !intel_engine_has_relative_mmio(engine)); 830 831 if (engine->class == RENDER_CLASS) { 832 if (INTEL_GEN(engine->i915) >= 12) 833 return gen12_rcs_offsets; 834 else if (INTEL_GEN(engine->i915) >= 11) 835 return gen11_rcs_offsets; 836 else 837 return gen8_rcs_offsets; 838 } else { 839 if (INTEL_GEN(engine->i915) >= 12) 840 return gen12_xcs_offsets; 841 else if (INTEL_GEN(engine->i915) >= 9) 842 return gen9_xcs_offsets; 843 else 844 return gen8_xcs_offsets; 845 } 846 } 847 848 static void unwind_wa_tail(struct i915_request *rq) 849 { 850 rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); 851 assert_ring_tail_valid(rq->ring, rq->tail); 852 } 853 854 static struct i915_request * 855 __unwind_incomplete_requests(struct intel_engine_cs *engine) 856 { 857 struct i915_request *rq, *rn, *active = NULL; 858 struct list_head *uninitialized_var(pl); 859 int prio = I915_PRIORITY_INVALID; 860 861 lockdep_assert_held(&engine->active.lock); 862 863 list_for_each_entry_safe_reverse(rq, rn, 864 &engine->active.requests, 865 sched.link) { 866 867 if (i915_request_completed(rq)) 868 continue; /* XXX */ 869 870 __i915_request_unsubmit(rq); 871 unwind_wa_tail(rq); 872 873 /* 874 * Push the request back into the queue for later resubmission. 875 * If this request is not native to this physical engine (i.e. 876 * it came from a virtual source), push it back onto the virtual 877 * engine so that it can be moved across onto another physical 878 * engine as load dictates. 879 */ 880 if (likely(rq->execution_mask == engine->mask)) { 881 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 882 if (rq_prio(rq) != prio) { 883 prio = rq_prio(rq); 884 pl = i915_sched_lookup_priolist(engine, prio); 885 } 886 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 887 888 list_move(&rq->sched.link, pl); 889 active = rq; 890 } else { 891 struct intel_engine_cs *owner = rq->hw_context->engine; 892 893 /* 894 * Decouple the virtual breadcrumb before moving it 895 * back to the virtual engine -- we don't want the 896 * request to complete in the background and try 897 * and cancel the breadcrumb on the virtual engine 898 * (instead of the old engine where it is linked)! 899 */ 900 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 901 &rq->fence.flags)) { 902 spin_lock_nested(&rq->lock, 903 SINGLE_DEPTH_NESTING); 904 i915_request_cancel_breadcrumb(rq); 905 spin_unlock(&rq->lock); 906 } 907 rq->engine = owner; 908 owner->submit_request(rq); 909 active = NULL; 910 } 911 } 912 913 return active; 914 } 915 916 struct i915_request * 917 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 918 { 919 struct intel_engine_cs *engine = 920 container_of(execlists, typeof(*engine), execlists); 921 922 return __unwind_incomplete_requests(engine); 923 } 924 925 static inline void 926 execlists_context_status_change(struct i915_request *rq, unsigned long status) 927 { 928 /* 929 * Only used when GVT-g is enabled now. When GVT-g is disabled, 930 * The compiler should eliminate this function as dead-code. 
931 */ 932 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 933 return; 934 935 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 936 status, rq); 937 } 938 939 static void intel_engine_context_in(struct intel_engine_cs *engine) 940 { 941 unsigned long flags; 942 943 if (READ_ONCE(engine->stats.enabled) == 0) 944 return; 945 946 write_seqlock_irqsave(&engine->stats.lock, flags); 947 948 if (engine->stats.enabled > 0) { 949 if (engine->stats.active++ == 0) 950 engine->stats.start = ktime_get(); 951 GEM_BUG_ON(engine->stats.active == 0); 952 } 953 954 write_sequnlock_irqrestore(&engine->stats.lock, flags); 955 } 956 957 static void intel_engine_context_out(struct intel_engine_cs *engine) 958 { 959 unsigned long flags; 960 961 if (READ_ONCE(engine->stats.enabled) == 0) 962 return; 963 964 write_seqlock_irqsave(&engine->stats.lock, flags); 965 966 if (engine->stats.enabled > 0) { 967 ktime_t last; 968 969 if (engine->stats.active && --engine->stats.active == 0) { 970 /* 971 * Decrement the active context count and in case GPU 972 * is now idle add up to the running total. 973 */ 974 last = ktime_sub(ktime_get(), engine->stats.start); 975 976 engine->stats.total = ktime_add(engine->stats.total, 977 last); 978 } else if (engine->stats.active == 0) { 979 /* 980 * After turning on engine stats, context out might be 981 * the first event in which case we account from the 982 * time stats gathering was turned on. 983 */ 984 last = ktime_sub(ktime_get(), engine->stats.enabled_at); 985 986 engine->stats.total = ktime_add(engine->stats.total, 987 last); 988 } 989 } 990 991 write_sequnlock_irqrestore(&engine->stats.lock, flags); 992 } 993 994 static void restore_default_state(struct intel_context *ce, 995 struct intel_engine_cs *engine) 996 { 997 u32 *regs = ce->lrc_reg_state; 998 999 if (engine->pinned_default_state) 1000 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 1001 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 1002 engine->context_size - PAGE_SIZE); 1003 1004 execlists_init_reg_state(regs, ce, engine, ce->ring, false); 1005 } 1006 1007 static void reset_active(struct i915_request *rq, 1008 struct intel_engine_cs *engine) 1009 { 1010 struct intel_context * const ce = rq->hw_context; 1011 u32 head; 1012 1013 /* 1014 * The executing context has been cancelled. We want to prevent 1015 * further execution along this context and propagate the error on 1016 * to anything depending on its results. 1017 * 1018 * In __i915_request_submit(), we apply the -EIO and remove the 1019 * requests' payloads for any banned requests. But first, we must 1020 * rewind the context back to the start of the incomplete request so 1021 * that we do not jump back into the middle of the batch. 1022 * 1023 * We preserve the breadcrumbs and semaphores of the incomplete 1024 * requests so that inter-timeline dependencies (i.e other timelines) 1025 * remain correctly ordered. And we defer to __i915_request_submit() 1026 * so that all asynchronous waits are correctly handled. 
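	 *
	 * (Concretely: below we rewind ce->ring->head to the start of the
	 * oldest incomplete request on this timeline and re-initialise the
	 * register state from the default context image, so the cancelled
	 * batch cannot be replayed.)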
1027 */ 1028 GEM_TRACE("%s(%s): { rq=%llx:%lld }\n", 1029 __func__, engine->name, rq->fence.context, rq->fence.seqno); 1030 1031 /* On resubmission of the active request, payload will be scrubbed */ 1032 if (i915_request_completed(rq)) 1033 head = rq->tail; 1034 else 1035 head = active_request(ce->timeline, rq)->head; 1036 ce->ring->head = intel_ring_wrap(ce->ring, head); 1037 intel_ring_update_space(ce->ring); 1038 1039 /* Scrub the context image to prevent replaying the previous batch */ 1040 restore_default_state(ce, engine); 1041 __execlists_update_reg_state(ce, engine); 1042 1043 /* We've switched away, so this should be a no-op, but intent matters */ 1044 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1045 } 1046 1047 static inline struct intel_engine_cs * 1048 __execlists_schedule_in(struct i915_request *rq) 1049 { 1050 struct intel_engine_cs * const engine = rq->engine; 1051 struct intel_context * const ce = rq->hw_context; 1052 1053 intel_context_get(ce); 1054 1055 if (unlikely(i915_gem_context_is_banned(ce->gem_context))) 1056 reset_active(rq, engine); 1057 1058 if (ce->tag) { 1059 /* Use a fixed tag for OA and friends */ 1060 ce->lrc_desc |= (u64)ce->tag << 32; 1061 } else { 1062 /* We don't need a strict matching tag, just different values */ 1063 ce->lrc_desc &= ~GENMASK_ULL(47, 37); 1064 ce->lrc_desc |= 1065 (u64)(engine->context_tag++ % NUM_CONTEXT_TAG) << 1066 GEN11_SW_CTX_ID_SHIFT; 1067 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID); 1068 } 1069 1070 intel_gt_pm_get(engine->gt); 1071 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1072 intel_engine_context_in(engine); 1073 1074 return engine; 1075 } 1076 1077 static inline struct i915_request * 1078 execlists_schedule_in(struct i915_request *rq, int idx) 1079 { 1080 struct intel_context * const ce = rq->hw_context; 1081 struct intel_engine_cs *old; 1082 1083 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1084 trace_i915_request_in(rq, idx); 1085 1086 old = READ_ONCE(ce->inflight); 1087 do { 1088 if (!old) { 1089 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1090 break; 1091 } 1092 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1093 1094 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1095 return i915_request_get(rq); 1096 } 1097 1098 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1099 { 1100 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1101 struct i915_request *next = READ_ONCE(ve->request); 1102 1103 if (next && next->execution_mask & ~rq->execution_mask) 1104 tasklet_schedule(&ve->base.execlists.tasklet); 1105 } 1106 1107 static inline void 1108 __execlists_schedule_out(struct i915_request *rq, 1109 struct intel_engine_cs * const engine) 1110 { 1111 struct intel_context * const ce = rq->hw_context; 1112 1113 /* 1114 * NB process_csb() is not under the engine->active.lock and hence 1115 * schedule_out can race with schedule_in meaning that we should 1116 * refrain from doing non-trivial work here. 1117 */ 1118 1119 /* 1120 * If we have just completed this context, the engine may now be 1121 * idle and we want to re-enter powersaving. 
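	 *
	 * (The retirement itself is deferred to a worker via
	 * intel_engine_add_retire() below; as noted above, we are racing
	 * with schedule_in and must not do non-trivial work here.)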
1122 */ 1123 if (list_is_last(&rq->link, &ce->timeline->requests) && 1124 i915_request_completed(rq)) 1125 intel_engine_add_retire(engine, ce->timeline); 1126 1127 intel_engine_context_out(engine); 1128 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1129 intel_gt_pm_put_async(engine->gt); 1130 1131 /* 1132 * If this is part of a virtual engine, its next request may 1133 * have been blocked waiting for access to the active context. 1134 * We have to kick all the siblings again in case we need to 1135 * switch (e.g. the next request is not runnable on this 1136 * engine). Hopefully, we will already have submitted the next 1137 * request before the tasklet runs and do not need to rebuild 1138 * each virtual tree and kick everyone again. 1139 */ 1140 if (ce->engine != engine) 1141 kick_siblings(rq, ce); 1142 1143 intel_context_put(ce); 1144 } 1145 1146 static inline void 1147 execlists_schedule_out(struct i915_request *rq) 1148 { 1149 struct intel_context * const ce = rq->hw_context; 1150 struct intel_engine_cs *cur, *old; 1151 1152 trace_i915_request_out(rq); 1153 1154 old = READ_ONCE(ce->inflight); 1155 do 1156 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1157 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1158 if (!cur) 1159 __execlists_schedule_out(rq, old); 1160 1161 i915_request_put(rq); 1162 } 1163 1164 static u64 execlists_update_context(const struct i915_request *rq) 1165 { 1166 struct intel_context *ce = rq->hw_context; 1167 u64 desc; 1168 1169 ce->lrc_reg_state[CTX_RING_TAIL] = 1170 intel_ring_set_tail(rq->ring, rq->tail); 1171 1172 /* 1173 * Make sure the context image is complete before we submit it to HW. 1174 * 1175 * Ostensibly, writes (including the WCB) should be flushed prior to 1176 * an uncached write such as our mmio register access, the empirical 1177 * evidence (esp. on Braswell) suggests that the WC write into memory 1178 * may not be visible to the HW prior to the completion of the UC 1179 * register write and that we may begin execution from the context 1180 * before its image is complete leading to invalid PD chasing. 1181 * 1182 * Furthermore, Braswell, at least, wants a full mb to be sure that 1183 * the writes are coherent in memory (visible to the GPU) prior to 1184 * execution, and not just visible to other CPUs (as is the result of 1185 * wmb). 1186 */ 1187 mb(); 1188 1189 desc = ce->lrc_desc; 1190 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; 1191 1192 /* Wa_1607138340:tgl */ 1193 if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0)) 1194 desc |= CTX_DESC_FORCE_RESTORE; 1195 1196 return desc; 1197 } 1198 1199 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1200 { 1201 if (execlists->ctrl_reg) { 1202 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1203 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1204 } else { 1205 writel(upper_32_bits(desc), execlists->submit_reg); 1206 writel(lower_32_bits(desc), execlists->submit_reg); 1207 } 1208 } 1209 1210 static __maybe_unused void 1211 trace_ports(const struct intel_engine_execlists *execlists, 1212 const char *msg, 1213 struct i915_request * const *ports) 1214 { 1215 const struct intel_engine_cs *engine = 1216 container_of(execlists, typeof(*engine), execlists); 1217 1218 if (!ports[0]) 1219 return; 1220 1221 GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n", 1222 engine->name, msg, 1223 ports[0]->fence.context, 1224 ports[0]->fence.seqno, 1225 i915_request_completed(ports[0]) ? "!" 
: 1226 i915_request_started(ports[0]) ? "*" : 1227 "", 1228 ports[1] ? ports[1]->fence.context : 0, 1229 ports[1] ? ports[1]->fence.seqno : 0); 1230 } 1231 1232 static __maybe_unused bool 1233 assert_pending_valid(const struct intel_engine_execlists *execlists, 1234 const char *msg) 1235 { 1236 struct i915_request * const *port, *rq; 1237 struct intel_context *ce = NULL; 1238 1239 trace_ports(execlists, msg, execlists->pending); 1240 1241 if (!execlists->pending[0]) { 1242 GEM_TRACE_ERR("Nothing pending for promotion!\n"); 1243 return false; 1244 } 1245 1246 if (execlists->pending[execlists_num_ports(execlists)]) { 1247 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n", 1248 execlists_num_ports(execlists)); 1249 return false; 1250 } 1251 1252 for (port = execlists->pending; (rq = *port); port++) { 1253 if (ce == rq->hw_context) { 1254 GEM_TRACE_ERR("Duplicate context in pending[%zd]\n", 1255 port - execlists->pending); 1256 return false; 1257 } 1258 1259 ce = rq->hw_context; 1260 if (i915_request_completed(rq)) 1261 continue; 1262 1263 if (i915_active_is_idle(&ce->active)) { 1264 GEM_TRACE_ERR("Inactive context in pending[%zd]\n", 1265 port - execlists->pending); 1266 return false; 1267 } 1268 1269 if (!i915_vma_is_pinned(ce->state)) { 1270 GEM_TRACE_ERR("Unpinned context in pending[%zd]\n", 1271 port - execlists->pending); 1272 return false; 1273 } 1274 1275 if (!i915_vma_is_pinned(ce->ring->vma)) { 1276 GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n", 1277 port - execlists->pending); 1278 return false; 1279 } 1280 } 1281 1282 return ce; 1283 } 1284 1285 static void execlists_submit_ports(struct intel_engine_cs *engine) 1286 { 1287 struct intel_engine_execlists *execlists = &engine->execlists; 1288 unsigned int n; 1289 1290 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1291 1292 /* 1293 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1294 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1295 * not be relinquished until the device is idle (see 1296 * i915_gem_idle_work_handler()). As a precaution, we make sure 1297 * that all ELSP are drained i.e. we have processed the CSB, 1298 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1299 */ 1300 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1301 1302 /* 1303 * ELSQ note: the submit queue is not cleared after being submitted 1304 * to the HW so we need to make sure we always clean it up. This is 1305 * currently ensured by the fact that we always write the same number 1306 * of elsq entries, keep this in mind before changing the loop below. 1307 */ 1308 for (n = execlists_num_ports(execlists); n--; ) { 1309 struct i915_request *rq = execlists->pending[n]; 1310 1311 write_desc(execlists, 1312 rq ? 
execlists_update_context(rq) : 0, 1313 n); 1314 } 1315 1316 /* we need to manually load the submit queue */ 1317 if (execlists->ctrl_reg) 1318 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1319 } 1320 1321 static bool ctx_single_port_submission(const struct intel_context *ce) 1322 { 1323 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1324 i915_gem_context_force_single_submission(ce->gem_context)); 1325 } 1326 1327 static bool can_merge_ctx(const struct intel_context *prev, 1328 const struct intel_context *next) 1329 { 1330 if (prev != next) 1331 return false; 1332 1333 if (ctx_single_port_submission(prev)) 1334 return false; 1335 1336 return true; 1337 } 1338 1339 static bool can_merge_rq(const struct i915_request *prev, 1340 const struct i915_request *next) 1341 { 1342 GEM_BUG_ON(prev == next); 1343 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1344 1345 /* 1346 * We do not submit known completed requests. Therefore if the next 1347 * request is already completed, we can pretend to merge it in 1348 * with the previous context (and we will skip updating the ELSP 1349 * and tracking). Thus hopefully keeping the ELSP full with active 1350 * contexts, despite the best efforts of preempt-to-busy to confuse 1351 * us. 1352 */ 1353 if (i915_request_completed(next)) 1354 return true; 1355 1356 if (unlikely((prev->flags ^ next->flags) & 1357 (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL))) 1358 return false; 1359 1360 if (!can_merge_ctx(prev->hw_context, next->hw_context)) 1361 return false; 1362 1363 return true; 1364 } 1365 1366 static void virtual_update_register_offsets(u32 *regs, 1367 struct intel_engine_cs *engine) 1368 { 1369 set_offsets(regs, reg_offsets(engine), engine); 1370 } 1371 1372 static bool virtual_matches(const struct virtual_engine *ve, 1373 const struct i915_request *rq, 1374 const struct intel_engine_cs *engine) 1375 { 1376 const struct intel_engine_cs *inflight; 1377 1378 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 1379 return false; 1380 1381 /* 1382 * We track when the HW has completed saving the context image 1383 * (i.e. when we have seen the final CS event switching out of 1384 * the context) and must not overwrite the context image before 1385 * then. This restricts us to only using the active engine 1386 * while the previous virtualized request is inflight (so 1387 * we reuse the register offsets). This is a very small 1388 * hystersis on the greedy seelction algorithm. 
1389 */ 1390 inflight = intel_context_inflight(&ve->context); 1391 if (inflight && inflight != engine) 1392 return false; 1393 1394 return true; 1395 } 1396 1397 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, 1398 struct intel_engine_cs *engine) 1399 { 1400 struct intel_engine_cs *old = ve->siblings[0]; 1401 1402 /* All unattached (rq->engine == old) must already be completed */ 1403 1404 spin_lock(&old->breadcrumbs.irq_lock); 1405 if (!list_empty(&ve->context.signal_link)) { 1406 list_move_tail(&ve->context.signal_link, 1407 &engine->breadcrumbs.signalers); 1408 intel_engine_queue_breadcrumbs(engine); 1409 } 1410 spin_unlock(&old->breadcrumbs.irq_lock); 1411 } 1412 1413 static struct i915_request * 1414 last_active(const struct intel_engine_execlists *execlists) 1415 { 1416 struct i915_request * const *last = READ_ONCE(execlists->active); 1417 1418 while (*last && i915_request_completed(*last)) 1419 last++; 1420 1421 return *last; 1422 } 1423 1424 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1425 { 1426 LIST_HEAD(list); 1427 1428 /* 1429 * We want to move the interrupted request to the back of 1430 * the round-robin list (i.e. its priority level), but 1431 * in doing so, we must then move all requests that were in 1432 * flight and were waiting for the interrupted request to 1433 * be run after it again. 1434 */ 1435 do { 1436 struct i915_dependency *p; 1437 1438 GEM_BUG_ON(i915_request_is_active(rq)); 1439 list_move_tail(&rq->sched.link, pl); 1440 1441 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 1442 struct i915_request *w = 1443 container_of(p->waiter, typeof(*w), sched); 1444 1445 /* Leave semaphores spinning on the other engines */ 1446 if (w->engine != rq->engine) 1447 continue; 1448 1449 /* No waiter should start before its signaler */ 1450 GEM_BUG_ON(i915_request_started(w) && 1451 !i915_request_completed(rq)); 1452 1453 GEM_BUG_ON(i915_request_is_active(w)); 1454 if (list_empty(&w->sched.link)) 1455 continue; /* Not yet submitted; unready */ 1456 1457 if (rq_prio(w) < rq_prio(rq)) 1458 continue; 1459 1460 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1461 list_move_tail(&w->sched.link, &list); 1462 } 1463 1464 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1465 } while (rq); 1466 } 1467 1468 static void defer_active(struct intel_engine_cs *engine) 1469 { 1470 struct i915_request *rq; 1471 1472 rq = __unwind_incomplete_requests(engine); 1473 if (!rq) 1474 return; 1475 1476 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1477 } 1478 1479 static bool 1480 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 1481 { 1482 int hint; 1483 1484 if (!intel_engine_has_timeslices(engine)) 1485 return false; 1486 1487 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1488 return false; 1489 1490 hint = max(rq_prio(list_next_entry(rq, sched.link)), 1491 engine->execlists.queue_priority_hint); 1492 1493 return hint >= effective_prio(rq); 1494 } 1495 1496 static int 1497 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1498 { 1499 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1500 return INT_MIN; 1501 1502 return rq_prio(list_next_entry(rq, sched.link)); 1503 } 1504 1505 static inline unsigned long 1506 timeslice(const struct intel_engine_cs *engine) 1507 { 1508 return READ_ONCE(engine->props.timeslice_duration_ms); 1509 } 1510 1511 static unsigned long 1512 active_timeslice(const struct intel_engine_cs *engine) 1513 { 1514 
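	/*
	 * Pick the timeslice to (re)arm for the request in ELSP[0]: none if
	 * it has already completed, none if nothing of sufficient priority
	 * is queued behind it (switch_priority_hint), otherwise the engine's
	 * configured timeslice duration.
	 */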
const struct i915_request *rq = *engine->execlists.active; 1515 1516 if (i915_request_completed(rq)) 1517 return 0; 1518 1519 if (engine->execlists.switch_priority_hint < effective_prio(rq)) 1520 return 0; 1521 1522 return timeslice(engine); 1523 } 1524 1525 static void set_timeslice(struct intel_engine_cs *engine) 1526 { 1527 if (!intel_engine_has_timeslices(engine)) 1528 return; 1529 1530 set_timer_ms(&engine->execlists.timer, active_timeslice(engine)); 1531 } 1532 1533 static void record_preemption(struct intel_engine_execlists *execlists) 1534 { 1535 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 1536 } 1537 1538 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine) 1539 { 1540 struct i915_request *rq; 1541 1542 rq = last_active(&engine->execlists); 1543 if (!rq) 1544 return 0; 1545 1546 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 1547 if (unlikely(i915_gem_context_is_banned(rq->gem_context))) 1548 return 1; 1549 1550 return READ_ONCE(engine->props.preempt_timeout_ms); 1551 } 1552 1553 static void set_preempt_timeout(struct intel_engine_cs *engine) 1554 { 1555 if (!intel_engine_has_preempt_reset(engine)) 1556 return; 1557 1558 set_timer_ms(&engine->execlists.preempt, 1559 active_preempt_timeout(engine)); 1560 } 1561 1562 static void execlists_dequeue(struct intel_engine_cs *engine) 1563 { 1564 struct intel_engine_execlists * const execlists = &engine->execlists; 1565 struct i915_request **port = execlists->pending; 1566 struct i915_request ** const last_port = port + execlists->port_mask; 1567 struct i915_request *last; 1568 struct rb_node *rb; 1569 bool submit = false; 1570 1571 /* 1572 * Hardware submission is through 2 ports. Conceptually each port 1573 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 1574 * static for a context, and unique to each, so we only execute 1575 * requests belonging to a single context from each ring. RING_HEAD 1576 * is maintained by the CS in the context image, it marks the place 1577 * where it got up to last time, and through RING_TAIL we tell the CS 1578 * where we want to execute up to this time. 1579 * 1580 * In this list the requests are in order of execution. Consecutive 1581 * requests from the same context are adjacent in the ringbuffer. We 1582 * can combine these requests into a single RING_TAIL update: 1583 * 1584 * RING_HEAD...req1...req2 1585 * ^- RING_TAIL 1586 * since to execute req2 the CS must first execute req1. 1587 * 1588 * Our goal then is to point each port to the end of a consecutive 1589 * sequence of requests as being the most optimal (fewest wake ups 1590 * and context switches) submission. 1591 */ 1592 1593 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 1594 struct virtual_engine *ve = 1595 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1596 struct i915_request *rq = READ_ONCE(ve->request); 1597 1598 if (!rq) { /* lazily cleanup after another engine handled rq */ 1599 rb_erase_cached(rb, &execlists->virtual); 1600 RB_CLEAR_NODE(rb); 1601 rb = rb_first_cached(&execlists->virtual); 1602 continue; 1603 } 1604 1605 if (!virtual_matches(ve, rq, engine)) { 1606 rb = rb_next(rb); 1607 continue; 1608 } 1609 1610 break; 1611 } 1612 1613 /* 1614 * If the queue is higher priority than the last 1615 * request in the currently active context, submit afresh. 1616 * We will resubmit again afterwards in case we need to split 1617 * the active context to interject the preemption request, 1618 * i.e. 
we will retrigger preemption following the ack in case 1619 * of trouble. 1620 */ 1621 last = last_active(execlists); 1622 if (last) { 1623 if (need_preempt(engine, last, rb)) { 1624 GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n", 1625 engine->name, 1626 last->fence.context, 1627 last->fence.seqno, 1628 last->sched.attr.priority, 1629 execlists->queue_priority_hint); 1630 record_preemption(execlists); 1631 1632 /* 1633 * Don't let the RING_HEAD advance past the breadcrumb 1634 * as we unwind (and until we resubmit) so that we do 1635 * not accidentally tell it to go backwards. 1636 */ 1637 ring_set_paused(engine, 1); 1638 1639 /* 1640 * Note that we have not stopped the GPU at this point, 1641 * so we are unwinding the incomplete requests as they 1642 * remain inflight and so by the time we do complete 1643 * the preemption, some of the unwound requests may 1644 * complete! 1645 */ 1646 __unwind_incomplete_requests(engine); 1647 1648 /* 1649 * If we need to return to the preempted context, we 1650 * need to skip the lite-restore and force it to 1651 * reload the RING_TAIL. Otherwise, the HW has a 1652 * tendency to ignore us rewinding the TAIL to the 1653 * end of an earlier request. 1654 */ 1655 last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1656 last = NULL; 1657 } else if (need_timeslice(engine, last) && 1658 timer_expired(&engine->execlists.timer)) { 1659 GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", 1660 engine->name, 1661 last->fence.context, 1662 last->fence.seqno, 1663 last->sched.attr.priority, 1664 execlists->queue_priority_hint); 1665 1666 ring_set_paused(engine, 1); 1667 defer_active(engine); 1668 1669 /* 1670 * Unlike for preemption, if we rewind and continue 1671 * executing the same context as previously active, 1672 * the order of execution will remain the same and 1673 * the tail will only advance. We do not need to 1674 * force a full context restore, as a lite-restore 1675 * is sufficient to resample the monotonic TAIL. 1676 * 1677 * If we switch to any other context, similarly we 1678 * will not rewind TAIL of current context, and 1679 * normal save/restore will preserve state and allow 1680 * us to later continue executing the same request. 1681 */ 1682 last = NULL; 1683 } else { 1684 /* 1685 * Otherwise if we already have a request pending 1686 * for execution after the current one, we can 1687 * just wait until the next CS event before 1688 * queuing more. In either case we will force a 1689 * lite-restore preemption event, but if we wait 1690 * we hopefully coalesce several updates into a single 1691 * submission. 1692 */ 1693 if (!list_is_last(&last->sched.link, 1694 &engine->active.requests)) { 1695 /* 1696 * Even if ELSP[1] is occupied and not worthy 1697 * of timeslices, our queue might be. 1698 */ 1699 if (!execlists->timer.expires && 1700 need_timeslice(engine, last)) 1701 set_timer_ms(&execlists->timer, 1702 timeslice(engine)); 1703 1704 return; 1705 } 1706 1707 /* 1708 * WaIdleLiteRestore:bdw,skl 1709 * Apply the wa NOOPs to prevent 1710 * ring:HEAD == rq:TAIL as we resubmit the 1711 * request. See gen8_emit_fini_breadcrumb() for 1712 * where we prepare the padding after the 1713 * end of the request. 
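			 *
			 * (The hazard avoided: resubmitting with RING_TAIL
			 * equal to RING_HEAD would make the lite-restored
			 * ring appear empty, so the padding guarantees TAIL
			 * has advanced past HEAD.)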
1714 */ 1715 last->tail = last->wa_tail; 1716 } 1717 } 1718 1719 while (rb) { /* XXX virtual is always taking precedence */ 1720 struct virtual_engine *ve = 1721 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1722 struct i915_request *rq; 1723 1724 spin_lock(&ve->base.active.lock); 1725 1726 rq = ve->request; 1727 if (unlikely(!rq)) { /* lost the race to a sibling */ 1728 spin_unlock(&ve->base.active.lock); 1729 rb_erase_cached(rb, &execlists->virtual); 1730 RB_CLEAR_NODE(rb); 1731 rb = rb_first_cached(&execlists->virtual); 1732 continue; 1733 } 1734 1735 GEM_BUG_ON(rq != ve->request); 1736 GEM_BUG_ON(rq->engine != &ve->base); 1737 GEM_BUG_ON(rq->hw_context != &ve->context); 1738 1739 if (rq_prio(rq) >= queue_prio(execlists)) { 1740 if (!virtual_matches(ve, rq, engine)) { 1741 spin_unlock(&ve->base.active.lock); 1742 rb = rb_next(rb); 1743 continue; 1744 } 1745 1746 if (last && !can_merge_rq(last, rq)) { 1747 spin_unlock(&ve->base.active.lock); 1748 return; /* leave this for another */ 1749 } 1750 1751 GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", 1752 engine->name, 1753 rq->fence.context, 1754 rq->fence.seqno, 1755 i915_request_completed(rq) ? "!" : 1756 i915_request_started(rq) ? "*" : 1757 "", 1758 yesno(engine != ve->siblings[0])); 1759 1760 ve->request = NULL; 1761 ve->base.execlists.queue_priority_hint = INT_MIN; 1762 rb_erase_cached(rb, &execlists->virtual); 1763 RB_CLEAR_NODE(rb); 1764 1765 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1766 rq->engine = engine; 1767 1768 if (engine != ve->siblings[0]) { 1769 u32 *regs = ve->context.lrc_reg_state; 1770 unsigned int n; 1771 1772 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1773 1774 if (!intel_engine_has_relative_mmio(engine)) 1775 virtual_update_register_offsets(regs, 1776 engine); 1777 1778 if (!list_empty(&ve->context.signals)) 1779 virtual_xfer_breadcrumbs(ve, engine); 1780 1781 /* 1782 * Move the bound engine to the top of the list 1783 * for future execution. We then kick this 1784 * tasklet first before checking others, so that 1785 * we preferentially reuse this set of bound 1786 * registers. 1787 */ 1788 for (n = 1; n < ve->num_siblings; n++) { 1789 if (ve->siblings[n] == engine) { 1790 swap(ve->siblings[n], 1791 ve->siblings[0]); 1792 break; 1793 } 1794 } 1795 1796 GEM_BUG_ON(ve->siblings[0] != engine); 1797 } 1798 1799 if (__i915_request_submit(rq)) { 1800 submit = true; 1801 last = rq; 1802 } 1803 i915_request_put(rq); 1804 1805 /* 1806 * Hmm, we have a bunch of virtual engine requests, 1807 * but the first one was already completed (thanks 1808 * preempt-to-busy!). Keep looking at the veng queue 1809 * until we have no more relevant requests (i.e. 1810 * the normal submit queue has higher priority). 1811 */ 1812 if (!submit) { 1813 spin_unlock(&ve->base.active.lock); 1814 rb = rb_first_cached(&execlists->virtual); 1815 continue; 1816 } 1817 } 1818 1819 spin_unlock(&ve->base.active.lock); 1820 break; 1821 } 1822 1823 while ((rb = rb_first_cached(&execlists->queue))) { 1824 struct i915_priolist *p = to_priolist(rb); 1825 struct i915_request *rq, *rn; 1826 int i; 1827 1828 priolist_for_each_request_consume(rq, rn, p, i) { 1829 bool merge = true; 1830 1831 /* 1832 * Can we combine this request with the current port? 1833 * It has to be the same context/ringbuffer and not 1834 * have any exceptions (e.g. GVT saying never to 1835 * combine contexts). 
1836 * 1837 * If we can combine the requests, we can execute both 1838 * by updating the RING_TAIL to point to the end of the 1839 * second request, and so we never need to tell the 1840 * hardware about the first. 1841 */ 1842 if (last && !can_merge_rq(last, rq)) { 1843 /* 1844 * If we are on the second port and cannot 1845 * combine this request with the last, then we 1846 * are done. 1847 */ 1848 if (port == last_port) 1849 goto done; 1850 1851 /* 1852 * We must not populate both ELSP[] with the 1853 * same LRCA, i.e. we must submit 2 different 1854 * contexts if we submit 2 ELSP. 1855 */ 1856 if (last->hw_context == rq->hw_context) 1857 goto done; 1858 1859 if (i915_request_has_sentinel(last)) 1860 goto done; 1861 1862 /* 1863 * If GVT overrides us we only ever submit 1864 * port[0], leaving port[1] empty. Note that we 1865 * also have to be careful that we don't queue 1866 * the same context (even though a different 1867 * request) to the second port. 1868 */ 1869 if (ctx_single_port_submission(last->hw_context) || 1870 ctx_single_port_submission(rq->hw_context)) 1871 goto done; 1872 1873 merge = false; 1874 } 1875 1876 if (__i915_request_submit(rq)) { 1877 if (!merge) { 1878 *port = execlists_schedule_in(last, port - execlists->pending); 1879 port++; 1880 last = NULL; 1881 } 1882 1883 GEM_BUG_ON(last && 1884 !can_merge_ctx(last->hw_context, 1885 rq->hw_context)); 1886 1887 submit = true; 1888 last = rq; 1889 } 1890 } 1891 1892 rb_erase_cached(&p->node, &execlists->queue); 1893 i915_priolist_free(p); 1894 } 1895 1896 done: 1897 /* 1898 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 1899 * 1900 * We choose the priority hint such that if we add a request of greater 1901 * priority than this, we kick the submission tasklet to decide on 1902 * the right order of submitting the requests to hardware. We must 1903 * also be prepared to reorder requests as they are in-flight on the 1904 * HW. We derive the priority hint then as the first "hole" in 1905 * the HW submission ports and if there are no available slots, 1906 * the priority of the lowest executing request, i.e. last. 1907 * 1908 * When we do receive a higher priority request ready to run from the 1909 * user, see queue_request(), the priority hint is bumped to that 1910 * request triggering preemption on the next dequeue (or subsequent 1911 * interrupt for secondary ports). 1912 */ 1913 execlists->queue_priority_hint = queue_prio(execlists); 1914 GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n", 1915 engine->name, execlists->queue_priority_hint, 1916 yesno(submit)); 1917 1918 if (submit) { 1919 *port = execlists_schedule_in(last, port - execlists->pending); 1920 execlists->switch_priority_hint = 1921 switch_prio(engine, *execlists->pending); 1922 1923 /* 1924 * Skip if we ended up with exactly the same set of requests, 1925 * e.g. 
trying to timeslice a pair of ordered contexts 1926 */ 1927 if (!memcmp(execlists->active, execlists->pending, 1928 (port - execlists->pending + 1) * sizeof(*port))) { 1929 do 1930 execlists_schedule_out(fetch_and_zero(port)); 1931 while (port-- != execlists->pending); 1932 1933 goto skip_submit; 1934 } 1935 1936 memset(port + 1, 0, (last_port - port) * sizeof(*port)); 1937 execlists_submit_ports(engine); 1938 1939 set_preempt_timeout(engine); 1940 } else { 1941 skip_submit: 1942 ring_set_paused(engine, 0); 1943 } 1944 } 1945 1946 static void 1947 cancel_port_requests(struct intel_engine_execlists * const execlists) 1948 { 1949 struct i915_request * const *port; 1950 1951 for (port = execlists->pending; *port; port++) 1952 execlists_schedule_out(*port); 1953 memset(execlists->pending, 0, sizeof(execlists->pending)); 1954 1955 /* Mark the end of active before we overwrite *active */ 1956 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 1957 execlists_schedule_out(*port); 1958 WRITE_ONCE(execlists->active, 1959 memset(execlists->inflight, 0, sizeof(execlists->inflight))); 1960 } 1961 1962 static inline void 1963 invalidate_csb_entries(const u32 *first, const u32 *last) 1964 { 1965 clflush((void *)first); 1966 clflush((void *)last); 1967 } 1968 1969 static inline bool 1970 reset_in_progress(const struct intel_engine_execlists *execlists) 1971 { 1972 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1973 } 1974 1975 /* 1976 * Starting with Gen12, the status has a new format: 1977 * 1978 * bit 0: switched to new queue 1979 * bit 1: reserved 1980 * bit 2: semaphore wait mode (poll or signal), only valid when 1981 * switch detail is set to "wait on semaphore" 1982 * bits 3-5: engine class 1983 * bits 6-11: engine instance 1984 * bits 12-14: reserved 1985 * bits 15-25: sw context id of the lrc the GT switched to 1986 * bits 26-31: sw counter of the lrc the GT switched to 1987 * bits 32-35: context switch detail 1988 * - 0: ctx complete 1989 * - 1: wait on sync flip 1990 * - 2: wait on vblank 1991 * - 3: wait on scanline 1992 * - 4: wait on semaphore 1993 * - 5: context preempted (not on SEMAPHORE_WAIT or 1994 * WAIT_FOR_EVENT) 1995 * bit 36: reserved 1996 * bits 37-43: wait detail (for switch detail 1 to 4) 1997 * bits 44-46: reserved 1998 * bits 47-57: sw context id of the lrc the GT switched away from 1999 * bits 58-63: sw counter of the lrc the GT switched away from 2000 */ 2001 static inline bool 2002 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2003 { 2004 u32 lower_dw = csb[0]; 2005 u32 upper_dw = csb[1]; 2006 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2007 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2008 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2009 2010 /* 2011 * The context switch detail is not guaranteed to be 5 when a preemption 2012 * occurs, so we can't just check for that. The check below works for 2013 * all the cases we care about, including preemptions of WAIT 2014 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2015 * would require some extra handling, but we don't support that. 2016 */ 2017 if (!ctx_away_valid || new_queue) { 2018 GEM_BUG_ON(!ctx_to_valid); 2019 return true; 2020 } 2021 2022 /* 2023 * switch detail = 5 is covered by the case above and we do not expect a 2024 * context switch on an unsuccessful wait instruction since we always 2025 * use polling mode. 
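	 *
	 * For reference, the overall decision reduces to (a convenience
	 * summary derived from the checks above and the assertion below):
	 *
	 *	ctx_away_valid	new_queue	result
	 *	     no		    -		promote (ctx_to must be valid)
	 *	    yes		   yes		promote (switched to a new queue)
	 *	    yes		    no		no promotion; process_csb()
	 *					treats the event as a completion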
2026 */ 2027 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2028 return false; 2029 } 2030 2031 static inline bool 2032 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2033 { 2034 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2035 } 2036 2037 static void process_csb(struct intel_engine_cs *engine) 2038 { 2039 struct intel_engine_execlists * const execlists = &engine->execlists; 2040 const u32 * const buf = execlists->csb_status; 2041 const u8 num_entries = execlists->csb_size; 2042 u8 head, tail; 2043 2044 /* 2045 * As we modify our execlists state tracking we require exclusive 2046 * access. Either we are inside the tasklet, or the tasklet is disabled 2047 * and we assume that is only inside the reset paths and so serialised. 2048 */ 2049 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2050 !reset_in_progress(execlists)); 2051 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2052 2053 /* 2054 * Note that csb_write, csb_status may be either in HWSP or mmio. 2055 * When reading from the csb_write mmio register, we have to be 2056 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2057 * the low 4bits. As it happens we know the next 4bits are always 2058 * zero and so we can simply masked off the low u8 of the register 2059 * and treat it identically to reading from the HWSP (without having 2060 * to use explicit shifting and masking, and probably bifurcating 2061 * the code to handle the legacy mmio read). 2062 */ 2063 head = execlists->csb_head; 2064 tail = READ_ONCE(*execlists->csb_write); 2065 GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail); 2066 if (unlikely(head == tail)) 2067 return; 2068 2069 /* 2070 * Hopefully paired with a wmb() in HW! 2071 * 2072 * We must complete the read of the write pointer before any reads 2073 * from the CSB, so that we do not see stale values. Without an rmb 2074 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2075 * we perform the READ_ONCE(*csb_write). 2076 */ 2077 rmb(); 2078 2079 do { 2080 bool promote; 2081 2082 if (++head == num_entries) 2083 head = 0; 2084 2085 /* 2086 * We are flying near dragons again. 2087 * 2088 * We hold a reference to the request in execlist_port[] 2089 * but no more than that. We are operating in softirq 2090 * context and so cannot hold any mutex or sleep. That 2091 * prevents us stopping the requests we are processing 2092 * in port[] from being retired simultaneously (the 2093 * breadcrumb will be complete before we see the 2094 * context-switch). As we only hold the reference to the 2095 * request, any pointer chasing underneath the request 2096 * is subject to a potential use-after-free. Thus we 2097 * store all of the bookkeeping within port[] as 2098 * required, and avoid using unguarded pointers beneath 2099 * request itself. The same applies to the atomic 2100 * status notifier. 
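		 *
		 * Each event is consumed below as a pair of dwords,
		 * buf[2 * head + 0] and buf[2 * head + 1]; only the Gen12+
		 * parser looks at the upper dword.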
2101 */ 2102 2103 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n", 2104 engine->name, head, 2105 buf[2 * head + 0], buf[2 * head + 1]); 2106 2107 if (INTEL_GEN(engine->i915) >= 12) 2108 promote = gen12_csb_parse(execlists, buf + 2 * head); 2109 else 2110 promote = gen8_csb_parse(execlists, buf + 2 * head); 2111 if (promote) { 2112 struct i915_request * const *old = execlists->active; 2113 2114 /* Point active to the new ELSP; prevent overwriting */ 2115 WRITE_ONCE(execlists->active, execlists->pending); 2116 set_timeslice(engine); 2117 2118 if (!inject_preempt_hang(execlists)) 2119 ring_set_paused(engine, 0); 2120 2121 /* cancel old inflight, prepare for switch */ 2122 trace_ports(execlists, "preempted", old); 2123 while (*old) 2124 execlists_schedule_out(*old++); 2125 2126 /* switch pending to inflight */ 2127 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2128 WRITE_ONCE(execlists->active, 2129 memcpy(execlists->inflight, 2130 execlists->pending, 2131 execlists_num_ports(execlists) * 2132 sizeof(*execlists->pending))); 2133 2134 WRITE_ONCE(execlists->pending[0], NULL); 2135 } else { 2136 GEM_BUG_ON(!*execlists->active); 2137 2138 /* port0 completed, advanced to port1 */ 2139 trace_ports(execlists, "completed", execlists->active); 2140 2141 /* 2142 * We rely on the hardware being strongly 2143 * ordered, that the breadcrumb write is 2144 * coherent (visible from the CPU) before the 2145 * user interrupt and CSB is processed. 2146 */ 2147 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 2148 !reset_in_progress(execlists)); 2149 execlists_schedule_out(*execlists->active++); 2150 2151 GEM_BUG_ON(execlists->active - execlists->inflight > 2152 execlists_num_ports(execlists)); 2153 } 2154 } while (head != tail); 2155 2156 execlists->csb_head = head; 2157 2158 /* 2159 * Gen11 has proven to fail wrt global observation point between 2160 * entry and tail update, failing on the ordering and thus 2161 * we see an old entry in the context status buffer. 2162 * 2163 * Forcibly evict out entries for the next gpu csb update, 2164 * to increase the odds that we get a fresh entries with non 2165 * working hardware. The cost for doing so comes out mostly with 2166 * the wash as hardware, working or not, will need to do the 2167 * invalidation before. 
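	 *
	 * (The "eviction" is simply the pair of clflushes issued by
	 * invalidate_csb_entries() below, covering the cachelines that hold
	 * the first and last CSB entries.)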
2168 */ 2169 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2170 } 2171 2172 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2173 { 2174 lockdep_assert_held(&engine->active.lock); 2175 if (!engine->execlists.pending[0]) { 2176 rcu_read_lock(); /* protect peeking at execlists->active */ 2177 execlists_dequeue(engine); 2178 rcu_read_unlock(); 2179 } 2180 } 2181 2182 static noinline void preempt_reset(struct intel_engine_cs *engine) 2183 { 2184 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2185 unsigned long *lock = &engine->gt->reset.flags; 2186 2187 if (i915_modparams.reset < 3) 2188 return; 2189 2190 if (test_and_set_bit(bit, lock)) 2191 return; 2192 2193 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2194 tasklet_disable_nosync(&engine->execlists.tasklet); 2195 2196 GEM_TRACE("%s: preempt timeout %lu+%ums\n", 2197 engine->name, 2198 READ_ONCE(engine->props.preempt_timeout_ms), 2199 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); 2200 intel_engine_reset(engine, "preemption time out"); 2201 2202 tasklet_enable(&engine->execlists.tasklet); 2203 clear_and_wake_up_bit(bit, lock); 2204 } 2205 2206 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2207 { 2208 const struct timer_list *t = &engine->execlists.preempt; 2209 2210 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2211 return false; 2212 2213 if (!timer_expired(t)) 2214 return false; 2215 2216 return READ_ONCE(engine->execlists.pending[0]); 2217 } 2218 2219 /* 2220 * Check the unread Context Status Buffers and manage the submission of new 2221 * contexts to the ELSP accordingly. 2222 */ 2223 static void execlists_submission_tasklet(unsigned long data) 2224 { 2225 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2226 bool timeout = preempt_timeout(engine); 2227 2228 process_csb(engine); 2229 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2230 unsigned long flags; 2231 2232 spin_lock_irqsave(&engine->active.lock, flags); 2233 __execlists_submission_tasklet(engine); 2234 spin_unlock_irqrestore(&engine->active.lock, flags); 2235 2236 /* Recheck after serialising with direct-submission */ 2237 if (timeout && preempt_timeout(engine)) 2238 preempt_reset(engine); 2239 } 2240 } 2241 2242 static void __execlists_kick(struct intel_engine_execlists *execlists) 2243 { 2244 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2245 tasklet_hi_schedule(&execlists->tasklet); 2246 } 2247 2248 #define execlists_kick(t, member) \ 2249 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2250 2251 static void execlists_timeslice(struct timer_list *timer) 2252 { 2253 execlists_kick(timer, timer); 2254 } 2255 2256 static void execlists_preempt(struct timer_list *timer) 2257 { 2258 execlists_kick(timer, preempt); 2259 } 2260 2261 static void queue_request(struct intel_engine_cs *engine, 2262 struct i915_sched_node *node, 2263 int prio) 2264 { 2265 GEM_BUG_ON(!list_empty(&node->link)); 2266 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); 2267 } 2268 2269 static void __submit_queue_imm(struct intel_engine_cs *engine) 2270 { 2271 struct intel_engine_execlists * const execlists = &engine->execlists; 2272 2273 if (reset_in_progress(execlists)) 2274 return; /* defer until we restart the engine following reset */ 2275 2276 if (execlists->tasklet.func == execlists_submission_tasklet) 2277 __execlists_submission_tasklet(engine); 2278 else 2279 
tasklet_hi_schedule(&execlists->tasklet); 2280 } 2281 2282 static void submit_queue(struct intel_engine_cs *engine, 2283 const struct i915_request *rq) 2284 { 2285 struct intel_engine_execlists *execlists = &engine->execlists; 2286 2287 if (rq_prio(rq) <= execlists->queue_priority_hint) 2288 return; 2289 2290 execlists->queue_priority_hint = rq_prio(rq); 2291 __submit_queue_imm(engine); 2292 } 2293 2294 static void execlists_submit_request(struct i915_request *request) 2295 { 2296 struct intel_engine_cs *engine = request->engine; 2297 unsigned long flags; 2298 2299 /* Will be called from irq-context when using foreign fences. */ 2300 spin_lock_irqsave(&engine->active.lock, flags); 2301 2302 queue_request(engine, &request->sched, rq_prio(request)); 2303 2304 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2305 GEM_BUG_ON(list_empty(&request->sched.link)); 2306 2307 submit_queue(engine, request); 2308 2309 spin_unlock_irqrestore(&engine->active.lock, flags); 2310 } 2311 2312 static void __execlists_context_fini(struct intel_context *ce) 2313 { 2314 intel_ring_put(ce->ring); 2315 i915_vma_put(ce->state); 2316 } 2317 2318 static void execlists_context_destroy(struct kref *kref) 2319 { 2320 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2321 2322 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2323 GEM_BUG_ON(intel_context_is_pinned(ce)); 2324 2325 if (ce->state) 2326 __execlists_context_fini(ce); 2327 2328 intel_context_fini(ce); 2329 intel_context_free(ce); 2330 } 2331 2332 static void 2333 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2334 { 2335 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2336 return; 2337 2338 vaddr += engine->context_size; 2339 2340 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); 2341 } 2342 2343 static void 2344 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2345 { 2346 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2347 return; 2348 2349 vaddr += engine->context_size; 2350 2351 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) 2352 dev_err_once(engine->i915->drm.dev, 2353 "%s context redzone overwritten!\n", 2354 engine->name); 2355 } 2356 2357 static void execlists_context_unpin(struct intel_context *ce) 2358 { 2359 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2360 ce->engine); 2361 2362 i915_gem_object_unpin_map(ce->state->obj); 2363 intel_ring_reset(ce->ring, ce->ring->tail); 2364 } 2365 2366 static void 2367 __execlists_update_reg_state(const struct intel_context *ce, 2368 const struct intel_engine_cs *engine) 2369 { 2370 struct intel_ring *ring = ce->ring; 2371 u32 *regs = ce->lrc_reg_state; 2372 2373 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); 2374 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2375 2376 regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma); 2377 regs[CTX_RING_HEAD] = ring->head; 2378 regs[CTX_RING_TAIL] = ring->tail; 2379 2380 /* RPCS */ 2381 if (engine->class == RENDER_CLASS) { 2382 regs[CTX_R_PWR_CLK_STATE] = 2383 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2384 2385 i915_oa_init_reg_state(ce, engine); 2386 } 2387 } 2388 2389 static int 2390 __execlists_context_pin(struct intel_context *ce, 2391 struct intel_engine_cs *engine) 2392 { 2393 void *vaddr; 2394 int ret; 2395 2396 GEM_BUG_ON(!ce->state); 2397 2398 ret = intel_context_active_acquire(ce); 2399 if (ret) 2400 goto err; 2401 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2402 2403 vaddr = i915_gem_object_pin_map(ce->state->obj, 2404 
i915_coherent_map_type(engine->i915) | 2405 I915_MAP_OVERRIDE); 2406 if (IS_ERR(vaddr)) { 2407 ret = PTR_ERR(vaddr); 2408 goto unpin_active; 2409 } 2410 2411 ce->lrc_desc = lrc_descriptor(ce, engine); 2412 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2413 __execlists_update_reg_state(ce, engine); 2414 2415 return 0; 2416 2417 unpin_active: 2418 intel_context_active_release(ce); 2419 err: 2420 return ret; 2421 } 2422 2423 static int execlists_context_pin(struct intel_context *ce) 2424 { 2425 return __execlists_context_pin(ce, ce->engine); 2426 } 2427 2428 static int execlists_context_alloc(struct intel_context *ce) 2429 { 2430 return __execlists_context_alloc(ce, ce->engine); 2431 } 2432 2433 static void execlists_context_reset(struct intel_context *ce) 2434 { 2435 /* 2436 * Because we emit WA_TAIL_DWORDS there may be a disparity 2437 * between our bookkeeping in ce->ring->head and ce->ring->tail and 2438 * that stored in context. As we only write new commands from 2439 * ce->ring->tail onwards, everything before that is junk. If the GPU 2440 * starts reading from its RING_HEAD from the context, it may try to 2441 * execute that junk and die. 2442 * 2443 * The contexts that are stilled pinned on resume belong to the 2444 * kernel, and are local to each engine. All other contexts will 2445 * have their head/tail sanitized upon pinning before use, so they 2446 * will never see garbage, 2447 * 2448 * So to avoid that we reset the context images upon resume. For 2449 * simplicity, we just zero everything out. 2450 */ 2451 intel_ring_reset(ce->ring, 0); 2452 __execlists_update_reg_state(ce, ce->engine); 2453 } 2454 2455 static const struct intel_context_ops execlists_context_ops = { 2456 .alloc = execlists_context_alloc, 2457 2458 .pin = execlists_context_pin, 2459 .unpin = execlists_context_unpin, 2460 2461 .enter = intel_context_enter_engine, 2462 .exit = intel_context_exit_engine, 2463 2464 .reset = execlists_context_reset, 2465 .destroy = execlists_context_destroy, 2466 }; 2467 2468 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 2469 { 2470 u32 *cs; 2471 2472 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb); 2473 2474 cs = intel_ring_begin(rq, 6); 2475 if (IS_ERR(cs)) 2476 return PTR_ERR(cs); 2477 2478 /* 2479 * Check if we have been preempted before we even get started. 2480 * 2481 * After this point i915_request_started() reports true, even if 2482 * we get preempted and so are no longer running. 2483 */ 2484 *cs++ = MI_ARB_CHECK; 2485 *cs++ = MI_NOOP; 2486 2487 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 2488 *cs++ = i915_request_timeline(rq)->hwsp_offset; 2489 *cs++ = 0; 2490 *cs++ = rq->fence.seqno - 1; 2491 2492 intel_ring_advance(rq, cs); 2493 2494 /* Record the updated position of the request's payload */ 2495 rq->infix = intel_ring_offset(rq, cs); 2496 2497 return 0; 2498 } 2499 2500 static int execlists_request_alloc(struct i915_request *request) 2501 { 2502 int ret; 2503 2504 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); 2505 2506 /* 2507 * Flush enough space to reduce the likelihood of waiting after 2508 * we start building the request - in which case we will just 2509 * have to repeat work. 2510 */ 2511 request->reserved_space += EXECLISTS_REQUEST_SIZE; 2512 2513 /* 2514 * Note that after this point, we have committed to using 2515 * this request as it is being used to both track the 2516 * state of engine initialisation and liveness of the 2517 * golden renderstate above. 
Think twice before you try 2518 * to cancel/unwind this request now. 2519 */ 2520 2521 /* Unconditionally invalidate GPU caches and TLBs. */ 2522 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 2523 if (ret) 2524 return ret; 2525 2526 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 2527 return 0; 2528 } 2529 2530 /* 2531 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 2532 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 2533 * but there is a slight complication as this is applied in WA batch where the 2534 * values are only initialized once so we cannot take register value at the 2535 * beginning and reuse it further; hence we save its value to memory, upload a 2536 * constant value with bit21 set and then we restore it back with the saved value. 2537 * To simplify the WA, a constant value is formed by using the default value 2538 * of this register. This shouldn't be a problem because we are only modifying 2539 * it for a short period and this batch in non-premptible. We can ofcourse 2540 * use additional instructions that read the actual value of the register 2541 * at that time and set our bit of interest but it makes the WA complicated. 2542 * 2543 * This WA is also required for Gen9 so extracting as a function avoids 2544 * code duplication. 2545 */ 2546 static u32 * 2547 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 2548 { 2549 /* NB no one else is allowed to scribble over scratch + 256! */ 2550 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2551 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2552 *batch++ = intel_gt_scratch_offset(engine->gt, 2553 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2554 *batch++ = 0; 2555 2556 *batch++ = MI_LOAD_REGISTER_IMM(1); 2557 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2558 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 2559 2560 batch = gen8_emit_pipe_control(batch, 2561 PIPE_CONTROL_CS_STALL | 2562 PIPE_CONTROL_DC_FLUSH_ENABLE, 2563 0); 2564 2565 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2566 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2567 *batch++ = intel_gt_scratch_offset(engine->gt, 2568 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2569 *batch++ = 0; 2570 2571 return batch; 2572 } 2573 2574 /* 2575 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 2576 * initialized at the beginning and shared across all contexts but this field 2577 * helps us to have multiple batches at different offsets and select them based 2578 * on a criteria. At the moment this batch always start at the beginning of the page 2579 * and at this point we don't have multiple wa_ctx batch buffers. 2580 * 2581 * The number of WA applied are not known at the beginning; we use this field 2582 * to return the no of DWORDS written. 2583 * 2584 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 2585 * so it adds NOOPs as padding to make it cacheline aligned. 2586 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 2587 * makes a complete batch buffer. 
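 *
 * Roughly, the single page allocated in lrc_setup_wa_ctx() ends up laid
 * out as follows (a sketch only; see intel_init_workaround_bb() for how
 * the offsets and sizes are actually recorded):
 *
 *	+-------------------------------+ indirect_ctx.offset == 0
 *	| indirect_ctx batch, padded	|
 *	| with MI_NOOPs to a cacheline	|
 *	+-------------------------------+ per_ctx.offset
 *	| per_ctx batch, if any		|
 *	+-------------------------------+
 *	| unused			|
 *	+-------------------------------+ CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)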
2588 */ 2589 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2590 { 2591 /* WaDisableCtxRestoreArbitration:bdw,chv */ 2592 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2593 2594 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 2595 if (IS_BROADWELL(engine->i915)) 2596 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2597 2598 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 2599 /* Actual scratch location is at 128 bytes offset */ 2600 batch = gen8_emit_pipe_control(batch, 2601 PIPE_CONTROL_FLUSH_L3 | 2602 PIPE_CONTROL_STORE_DATA_INDEX | 2603 PIPE_CONTROL_CS_STALL | 2604 PIPE_CONTROL_QW_WRITE, 2605 LRC_PPHWSP_SCRATCH_ADDR); 2606 2607 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2608 2609 /* Pad to end of cacheline */ 2610 while ((unsigned long)batch % CACHELINE_BYTES) 2611 *batch++ = MI_NOOP; 2612 2613 /* 2614 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 2615 * execution depends on the length specified in terms of cache lines 2616 * in the register CTX_RCS_INDIRECT_CTX 2617 */ 2618 2619 return batch; 2620 } 2621 2622 struct lri { 2623 i915_reg_t reg; 2624 u32 value; 2625 }; 2626 2627 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 2628 { 2629 GEM_BUG_ON(!count || count > 63); 2630 2631 *batch++ = MI_LOAD_REGISTER_IMM(count); 2632 do { 2633 *batch++ = i915_mmio_reg_offset(lri->reg); 2634 *batch++ = lri->value; 2635 } while (lri++, --count); 2636 *batch++ = MI_NOOP; 2637 2638 return batch; 2639 } 2640 2641 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2642 { 2643 static const struct lri lri[] = { 2644 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 2645 { 2646 COMMON_SLICE_CHICKEN2, 2647 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 2648 0), 2649 }, 2650 2651 /* BSpec: 11391 */ 2652 { 2653 FF_SLICE_CHICKEN, 2654 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 2655 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 2656 }, 2657 2658 /* BSpec: 11299 */ 2659 { 2660 _3D_CHICKEN3, 2661 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 2662 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 2663 } 2664 }; 2665 2666 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2667 2668 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 2669 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2670 2671 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 2672 2673 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 2674 if (HAS_POOLED_EU(engine->i915)) { 2675 /* 2676 * EU pool configuration is setup along with golden context 2677 * during context initialization. This value depends on 2678 * device type (2x6 or 3x6) and needs to be updated based 2679 * on which subslice is disabled especially for 2x6 2680 * devices, however it is safe to load default 2681 * configuration of 3x6 device instead of masking off 2682 * corresponding bits because HW ignores bits of a disabled 2683 * subslice and drops down to appropriate config. Please 2684 * see render_state_setup() in i915_gem_render_state.c for 2685 * possible configurations, to avoid duplication they are 2686 * not shown here again. 
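		 *
		 * (Here, the 0x00777000 dword programmed below is taken to
		 * be that default 3x6 pool configuration.)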
2687 */ 2688 *batch++ = GEN9_MEDIA_POOL_STATE; 2689 *batch++ = GEN9_MEDIA_POOL_ENABLE; 2690 *batch++ = 0x00777000; 2691 *batch++ = 0; 2692 *batch++ = 0; 2693 *batch++ = 0; 2694 } 2695 2696 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2697 2698 /* Pad to end of cacheline */ 2699 while ((unsigned long)batch % CACHELINE_BYTES) 2700 *batch++ = MI_NOOP; 2701 2702 return batch; 2703 } 2704 2705 static u32 * 2706 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2707 { 2708 int i; 2709 2710 /* 2711 * WaPipeControlBefore3DStateSamplePattern: cnl 2712 * 2713 * Ensure the engine is idle prior to programming a 2714 * 3DSTATE_SAMPLE_PATTERN during a context restore. 2715 */ 2716 batch = gen8_emit_pipe_control(batch, 2717 PIPE_CONTROL_CS_STALL, 2718 0); 2719 /* 2720 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 2721 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 2722 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 2723 * confusing. Since gen8_emit_pipe_control() already advances the 2724 * batch by 6 dwords, we advance the other 10 here, completing a 2725 * cacheline. It's not clear if the workaround requires this padding 2726 * before other commands, or if it's just the regular padding we would 2727 * already have for the workaround bb, so leave it here for now. 2728 */ 2729 for (i = 0; i < 10; i++) 2730 *batch++ = MI_NOOP; 2731 2732 /* Pad to end of cacheline */ 2733 while ((unsigned long)batch % CACHELINE_BYTES) 2734 *batch++ = MI_NOOP; 2735 2736 return batch; 2737 } 2738 2739 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 2740 2741 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 2742 { 2743 struct drm_i915_gem_object *obj; 2744 struct i915_vma *vma; 2745 int err; 2746 2747 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 2748 if (IS_ERR(obj)) 2749 return PTR_ERR(obj); 2750 2751 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 2752 if (IS_ERR(vma)) { 2753 err = PTR_ERR(vma); 2754 goto err; 2755 } 2756 2757 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 2758 if (err) 2759 goto err; 2760 2761 engine->wa_ctx.vma = vma; 2762 return 0; 2763 2764 err: 2765 i915_gem_object_put(obj); 2766 return err; 2767 } 2768 2769 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 2770 { 2771 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 2772 } 2773 2774 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 2775 2776 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 2777 { 2778 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 2779 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 2780 &wa_ctx->per_ctx }; 2781 wa_bb_func_t wa_bb_fn[2]; 2782 struct page *page; 2783 void *batch, *batch_ptr; 2784 unsigned int i; 2785 int ret; 2786 2787 if (engine->class != RENDER_CLASS) 2788 return 0; 2789 2790 switch (INTEL_GEN(engine->i915)) { 2791 case 12: 2792 case 11: 2793 return 0; 2794 case 10: 2795 wa_bb_fn[0] = gen10_init_indirectctx_bb; 2796 wa_bb_fn[1] = NULL; 2797 break; 2798 case 9: 2799 wa_bb_fn[0] = gen9_init_indirectctx_bb; 2800 wa_bb_fn[1] = NULL; 2801 break; 2802 case 8: 2803 wa_bb_fn[0] = gen8_init_indirectctx_bb; 2804 wa_bb_fn[1] = NULL; 2805 break; 2806 default: 2807 MISSING_CASE(INTEL_GEN(engine->i915)); 2808 return 0; 2809 } 2810 2811 ret = lrc_setup_wa_ctx(engine); 2812 if (ret) { 2813 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 2814 return ret; 2815 } 2816 2817 page = 
i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 2818 batch = batch_ptr = kmap_atomic(page); 2819 2820 /* 2821 * Emit the two workaround batch buffers, recording the offset from the 2822 * start of the workaround batch buffer object for each and their 2823 * respective sizes. 2824 */ 2825 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 2826 wa_bb[i]->offset = batch_ptr - batch; 2827 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 2828 CACHELINE_BYTES))) { 2829 ret = -EINVAL; 2830 break; 2831 } 2832 if (wa_bb_fn[i]) 2833 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 2834 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 2835 } 2836 2837 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 2838 2839 kunmap_atomic(batch); 2840 if (ret) 2841 lrc_destroy_wa_ctx(engine); 2842 2843 return ret; 2844 } 2845 2846 static void enable_execlists(struct intel_engine_cs *engine) 2847 { 2848 u32 mode; 2849 2850 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 2851 2852 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 2853 2854 if (INTEL_GEN(engine->i915) >= 11) 2855 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 2856 else 2857 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 2858 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 2859 2860 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 2861 2862 ENGINE_WRITE_FW(engine, 2863 RING_HWS_PGA, 2864 i915_ggtt_offset(engine->status_page.vma)); 2865 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 2866 } 2867 2868 static bool unexpected_starting_state(struct intel_engine_cs *engine) 2869 { 2870 bool unexpected = false; 2871 2872 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 2873 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 2874 unexpected = true; 2875 } 2876 2877 return unexpected; 2878 } 2879 2880 static int execlists_resume(struct intel_engine_cs *engine) 2881 { 2882 intel_engine_apply_workarounds(engine); 2883 intel_engine_apply_whitelist(engine); 2884 2885 intel_mocs_init_engine(engine); 2886 2887 intel_engine_reset_breadcrumbs(engine); 2888 2889 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 2890 struct drm_printer p = drm_debug_printer(__func__); 2891 2892 intel_engine_dump(engine, &p, NULL); 2893 } 2894 2895 enable_execlists(engine); 2896 2897 return 0; 2898 } 2899 2900 static void execlists_reset_prepare(struct intel_engine_cs *engine) 2901 { 2902 struct intel_engine_execlists * const execlists = &engine->execlists; 2903 unsigned long flags; 2904 2905 GEM_TRACE("%s: depth<-%d\n", engine->name, 2906 atomic_read(&execlists->tasklet.count)); 2907 2908 /* 2909 * Prevent request submission to the hardware until we have 2910 * completed the reset in i915_gem_reset_finish(). If a request 2911 * is completed by one engine, it may then queue a request 2912 * to a second via its execlists->tasklet *just* as we are 2913 * calling engine->resume() and also writing the ELSP. 2914 * Turning off the execlists->tasklet until the reset is over 2915 * prevents the race. 2916 */ 2917 __tasklet_disable_sync_once(&execlists->tasklet); 2918 GEM_BUG_ON(!reset_in_progress(execlists)); 2919 2920 /* And flush any current direct submission. */ 2921 spin_lock_irqsave(&engine->active.lock, flags); 2922 spin_unlock_irqrestore(&engine->active.lock, flags); 2923 2924 /* 2925 * We stop engines, otherwise we might get failed reset and a 2926 * dead gpu (on elk). 
Also as modern gpu as kbl can suffer 2927 * from system hang if batchbuffer is progressing when 2928 * the reset is issued, regardless of READY_TO_RESET ack. 2929 * Thus assume it is best to stop engines on all gens 2930 * where we have a gpu reset. 2931 * 2932 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 2933 * 2934 * FIXME: Wa for more modern gens needs to be validated 2935 */ 2936 intel_engine_stop_cs(engine); 2937 } 2938 2939 static void reset_csb_pointers(struct intel_engine_cs *engine) 2940 { 2941 struct intel_engine_execlists * const execlists = &engine->execlists; 2942 const unsigned int reset_value = execlists->csb_size - 1; 2943 2944 ring_set_paused(engine, 0); 2945 2946 /* 2947 * After a reset, the HW starts writing into CSB entry [0]. We 2948 * therefore have to set our HEAD pointer back one entry so that 2949 * the *first* entry we check is entry 0. To complicate this further, 2950 * as we don't wait for the first interrupt after reset, we have to 2951 * fake the HW write to point back to the last entry so that our 2952 * inline comparison of our cached head position against the last HW 2953 * write works even before the first interrupt. 2954 */ 2955 execlists->csb_head = reset_value; 2956 WRITE_ONCE(*execlists->csb_write, reset_value); 2957 wmb(); /* Make sure this is visible to HW (paranoia?) */ 2958 2959 invalidate_csb_entries(&execlists->csb_status[0], 2960 &execlists->csb_status[reset_value]); 2961 } 2962 2963 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 2964 { 2965 if (INTEL_GEN(engine->i915) >= 12) 2966 return 0x60; 2967 else if (INTEL_GEN(engine->i915) >= 9) 2968 return 0x54; 2969 else if (engine->class == RENDER_CLASS) 2970 return 0x58; 2971 else 2972 return -1; 2973 } 2974 2975 static void __execlists_reset_reg_state(const struct intel_context *ce, 2976 const struct intel_engine_cs *engine) 2977 { 2978 u32 *regs = ce->lrc_reg_state; 2979 int x; 2980 2981 x = lrc_ring_mi_mode(engine); 2982 if (x != -1) { 2983 regs[x + 1] &= ~STOP_RING; 2984 regs[x + 1] |= STOP_RING << 16; 2985 } 2986 } 2987 2988 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 2989 { 2990 struct intel_engine_execlists * const execlists = &engine->execlists; 2991 struct intel_context *ce; 2992 struct i915_request *rq; 2993 2994 mb(); /* paranoia: read the CSB pointers from after the reset */ 2995 clflush(execlists->csb_write); 2996 mb(); 2997 2998 process_csb(engine); /* drain preemption events */ 2999 3000 /* Following the reset, we need to reload the CSB read/write pointers */ 3001 reset_csb_pointers(engine); 3002 3003 /* 3004 * Save the currently executing context, even if we completed 3005 * its request, it was still running at the time of the 3006 * reset and will have been clobbered. 3007 */ 3008 rq = execlists_active(execlists); 3009 if (!rq) 3010 goto unwind; 3011 3012 /* We still have requests in-flight; the engine should be active */ 3013 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3014 3015 ce = rq->hw_context; 3016 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3017 3018 if (i915_request_completed(rq)) { 3019 /* Idle context; tidy up the ring so we can restart afresh */ 3020 ce->ring->head = intel_ring_wrap(ce->ring, rq->tail); 3021 goto out_replay; 3022 } 3023 3024 /* Context has requests still in-flight; it should not be idle! 
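	 * Rewind the ring to the start of the oldest incomplete request so
	 * that the remaining requests can be replayed on restart.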
*/ 3025 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3026 rq = active_request(ce->timeline, rq); 3027 ce->ring->head = intel_ring_wrap(ce->ring, rq->head); 3028 GEM_BUG_ON(ce->ring->head == ce->ring->tail); 3029 3030 /* 3031 * If this request hasn't started yet, e.g. it is waiting on a 3032 * semaphore, we need to avoid skipping the request or else we 3033 * break the signaling chain. However, if the context is corrupt 3034 * the request will not restart and we will be stuck with a wedged 3035 * device. It is quite often the case that if we issue a reset 3036 * while the GPU is loading the context image, that the context 3037 * image becomes corrupt. 3038 * 3039 * Otherwise, if we have not started yet, the request should replay 3040 * perfectly and we do not need to flag the result as being erroneous. 3041 */ 3042 if (!i915_request_started(rq)) 3043 goto out_replay; 3044 3045 /* 3046 * If the request was innocent, we leave the request in the ELSP 3047 * and will try to replay it on restarting. The context image may 3048 * have been corrupted by the reset, in which case we may have 3049 * to service a new GPU hang, but more likely we can continue on 3050 * without impact. 3051 * 3052 * If the request was guilty, we presume the context is corrupt 3053 * and have to at least restore the RING register in the context 3054 * image back to the expected values to skip over the guilty request. 3055 */ 3056 __i915_request_reset(rq, stalled); 3057 if (!stalled) 3058 goto out_replay; 3059 3060 /* 3061 * We want a simple context + ring to execute the breadcrumb update. 3062 * We cannot rely on the context being intact across the GPU hang, 3063 * so clear it and rebuild just what we need for the breadcrumb. 3064 * All pending requests for this context will be zapped, and any 3065 * future request will be after userspace has had the opportunity 3066 * to recreate its own state. 3067 */ 3068 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3069 restore_default_state(ce, engine); 3070 3071 out_replay: 3072 GEM_TRACE("%s replay {head:%04x, tail:%04x}\n", 3073 engine->name, ce->ring->head, ce->ring->tail); 3074 intel_ring_update_space(ce->ring); 3075 __execlists_reset_reg_state(ce, engine); 3076 __execlists_update_reg_state(ce, engine); 3077 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3078 3079 unwind: 3080 /* Push back any incomplete requests for replay after the reset. */ 3081 cancel_port_requests(execlists); 3082 __unwind_incomplete_requests(engine); 3083 } 3084 3085 static void execlists_reset(struct intel_engine_cs *engine, bool stalled) 3086 { 3087 unsigned long flags; 3088 3089 GEM_TRACE("%s\n", engine->name); 3090 3091 spin_lock_irqsave(&engine->active.lock, flags); 3092 3093 __execlists_reset(engine, stalled); 3094 3095 spin_unlock_irqrestore(&engine->active.lock, flags); 3096 } 3097 3098 static void nop_submission_tasklet(unsigned long data) 3099 { 3100 /* The driver is wedged; don't process any more events. */ 3101 } 3102 3103 static void execlists_cancel_requests(struct intel_engine_cs *engine) 3104 { 3105 struct intel_engine_execlists * const execlists = &engine->execlists; 3106 struct i915_request *rq, *rn; 3107 struct rb_node *rb; 3108 unsigned long flags; 3109 3110 GEM_TRACE("%s\n", engine->name); 3111 3112 /* 3113 * Before we call engine->cancel_requests(), we should have exclusive 3114 * access to the submission state. 
This is arranged for us by the 3115 * caller disabling the interrupt generation, the tasklet and other 3116 * threads that may then access the same state, giving us a free hand 3117 * to reset state. However, we still need to let lockdep be aware that 3118 * we know this state may be accessed in hardirq context, so we 3119 * disable the irq around this manipulation and we want to keep 3120 * the spinlock focused on its duties and not accidentally conflate 3121 * coverage to the submission's irq state. (Similarly, although we 3122 * shouldn't need to disable irq around the manipulation of the 3123 * submission's irq state, we also wish to remind ourselves that 3124 * it is irq state.) 3125 */ 3126 spin_lock_irqsave(&engine->active.lock, flags); 3127 3128 __execlists_reset(engine, true); 3129 3130 /* Mark all executing requests as skipped. */ 3131 list_for_each_entry(rq, &engine->active.requests, sched.link) 3132 mark_eio(rq); 3133 3134 /* Flush the queued requests to the timeline list (for retiring). */ 3135 while ((rb = rb_first_cached(&execlists->queue))) { 3136 struct i915_priolist *p = to_priolist(rb); 3137 int i; 3138 3139 priolist_for_each_request_consume(rq, rn, p, i) { 3140 mark_eio(rq); 3141 __i915_request_submit(rq); 3142 } 3143 3144 rb_erase_cached(&p->node, &execlists->queue); 3145 i915_priolist_free(p); 3146 } 3147 3148 /* Cancel all attached virtual engines */ 3149 while ((rb = rb_first_cached(&execlists->virtual))) { 3150 struct virtual_engine *ve = 3151 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3152 3153 rb_erase_cached(rb, &execlists->virtual); 3154 RB_CLEAR_NODE(rb); 3155 3156 spin_lock(&ve->base.active.lock); 3157 rq = fetch_and_zero(&ve->request); 3158 if (rq) { 3159 mark_eio(rq); 3160 3161 rq->engine = engine; 3162 __i915_request_submit(rq); 3163 i915_request_put(rq); 3164 3165 ve->base.execlists.queue_priority_hint = INT_MIN; 3166 } 3167 spin_unlock(&ve->base.active.lock); 3168 } 3169 3170 /* Remaining _unready_ requests will be nop'ed when submitted */ 3171 3172 execlists->queue_priority_hint = INT_MIN; 3173 execlists->queue = RB_ROOT_CACHED; 3174 3175 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3176 execlists->tasklet.func = nop_submission_tasklet; 3177 3178 spin_unlock_irqrestore(&engine->active.lock, flags); 3179 } 3180 3181 static void execlists_reset_finish(struct intel_engine_cs *engine) 3182 { 3183 struct intel_engine_execlists * const execlists = &engine->execlists; 3184 3185 /* 3186 * After a GPU reset, we may have requests to replay. Do so now while 3187 * we still have the forcewake to be sure that the GPU is not allowed 3188 * to sleep before we restart and reload a context. 3189 */ 3190 GEM_BUG_ON(!reset_in_progress(execlists)); 3191 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3192 execlists->tasklet.func(execlists->tasklet.data); 3193 3194 if (__tasklet_enable(&execlists->tasklet)) 3195 /* And kick in case we missed a new request submission. 
*/ 3196 tasklet_hi_schedule(&execlists->tasklet); 3197 GEM_TRACE("%s: depth->%d\n", engine->name, 3198 atomic_read(&execlists->tasklet.count)); 3199 } 3200 3201 static int gen8_emit_bb_start(struct i915_request *rq, 3202 u64 offset, u32 len, 3203 const unsigned int flags) 3204 { 3205 u32 *cs; 3206 3207 cs = intel_ring_begin(rq, 4); 3208 if (IS_ERR(cs)) 3209 return PTR_ERR(cs); 3210 3211 /* 3212 * WaDisableCtxRestoreArbitration:bdw,chv 3213 * 3214 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3215 * particular all the gen that do not need the w/a at all!), if we 3216 * took care to make sure that on every switch into this context 3217 * (both ordinary and for preemption) that arbitrartion was enabled 3218 * we would be fine. However, for gen8 there is another w/a that 3219 * requires us to not preempt inside GPGPU execution, so we keep 3220 * arbitration disabled for gen8 batches. Arbitration will be 3221 * re-enabled before we close the request 3222 * (engine->emit_fini_breadcrumb). 3223 */ 3224 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3225 3226 /* FIXME(BDW+): Address space and security selectors. */ 3227 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3228 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3229 *cs++ = lower_32_bits(offset); 3230 *cs++ = upper_32_bits(offset); 3231 3232 intel_ring_advance(rq, cs); 3233 3234 return 0; 3235 } 3236 3237 static int gen9_emit_bb_start(struct i915_request *rq, 3238 u64 offset, u32 len, 3239 const unsigned int flags) 3240 { 3241 u32 *cs; 3242 3243 cs = intel_ring_begin(rq, 6); 3244 if (IS_ERR(cs)) 3245 return PTR_ERR(cs); 3246 3247 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3248 3249 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3250 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3251 *cs++ = lower_32_bits(offset); 3252 *cs++ = upper_32_bits(offset); 3253 3254 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3255 *cs++ = MI_NOOP; 3256 3257 intel_ring_advance(rq, cs); 3258 3259 return 0; 3260 } 3261 3262 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3263 { 3264 ENGINE_WRITE(engine, RING_IMR, 3265 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3266 ENGINE_POSTING_READ(engine, RING_IMR); 3267 } 3268 3269 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3270 { 3271 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3272 } 3273 3274 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3275 { 3276 u32 cmd, *cs; 3277 3278 cs = intel_ring_begin(request, 4); 3279 if (IS_ERR(cs)) 3280 return PTR_ERR(cs); 3281 3282 cmd = MI_FLUSH_DW + 1; 3283 3284 /* We always require a command barrier so that subsequent 3285 * commands, such as breadcrumb interrupts, are strictly ordered 3286 * wrt the contents of the write cache being flushed to memory 3287 * (and thus being coherent from the CPU). 
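	 *
	 * The flush emitted below is then a 4-dword MI_FLUSH_DW:
	 *
	 *	dw0: cmd (MI_FLUSH_DW + post-sync store-dword, plus TLB/BSD
	 *	     invalidation bits when EMIT_INVALIDATE is requested)
	 *	dw1: LRC_PPHWSP_SCRATCH_ADDR (post-sync write target)
	 *	dw2: 0 (upper address)
	 *	dw3: 0 (value written)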
3288 */ 3289 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3290 3291 if (mode & EMIT_INVALIDATE) { 3292 cmd |= MI_INVALIDATE_TLB; 3293 if (request->engine->class == VIDEO_DECODE_CLASS) 3294 cmd |= MI_INVALIDATE_BSD; 3295 } 3296 3297 *cs++ = cmd; 3298 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3299 *cs++ = 0; /* upper addr */ 3300 *cs++ = 0; /* value */ 3301 intel_ring_advance(request, cs); 3302 3303 return 0; 3304 } 3305 3306 static int gen8_emit_flush_render(struct i915_request *request, 3307 u32 mode) 3308 { 3309 bool vf_flush_wa = false, dc_flush_wa = false; 3310 u32 *cs, flags = 0; 3311 int len; 3312 3313 flags |= PIPE_CONTROL_CS_STALL; 3314 3315 if (mode & EMIT_FLUSH) { 3316 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3317 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3318 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3319 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3320 } 3321 3322 if (mode & EMIT_INVALIDATE) { 3323 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3324 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3325 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3326 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3327 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3328 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3329 flags |= PIPE_CONTROL_QW_WRITE; 3330 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3331 3332 /* 3333 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3334 * pipe control. 3335 */ 3336 if (IS_GEN(request->i915, 9)) 3337 vf_flush_wa = true; 3338 3339 /* WaForGAMHang:kbl */ 3340 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3341 dc_flush_wa = true; 3342 } 3343 3344 len = 6; 3345 3346 if (vf_flush_wa) 3347 len += 6; 3348 3349 if (dc_flush_wa) 3350 len += 12; 3351 3352 cs = intel_ring_begin(request, len); 3353 if (IS_ERR(cs)) 3354 return PTR_ERR(cs); 3355 3356 if (vf_flush_wa) 3357 cs = gen8_emit_pipe_control(cs, 0, 0); 3358 3359 if (dc_flush_wa) 3360 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3361 0); 3362 3363 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3364 3365 if (dc_flush_wa) 3366 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3367 3368 intel_ring_advance(request, cs); 3369 3370 return 0; 3371 } 3372 3373 static int gen11_emit_flush_render(struct i915_request *request, 3374 u32 mode) 3375 { 3376 if (mode & EMIT_FLUSH) { 3377 u32 *cs; 3378 u32 flags = 0; 3379 3380 flags |= PIPE_CONTROL_CS_STALL; 3381 3382 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3383 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3384 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3385 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3386 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3387 flags |= PIPE_CONTROL_QW_WRITE; 3388 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3389 3390 cs = intel_ring_begin(request, 6); 3391 if (IS_ERR(cs)) 3392 return PTR_ERR(cs); 3393 3394 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3395 intel_ring_advance(request, cs); 3396 } 3397 3398 if (mode & EMIT_INVALIDATE) { 3399 u32 *cs; 3400 u32 flags = 0; 3401 3402 flags |= PIPE_CONTROL_CS_STALL; 3403 3404 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3405 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3406 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3407 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3408 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3409 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3410 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3411 flags |= PIPE_CONTROL_QW_WRITE; 3412 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3413 3414 cs = intel_ring_begin(request, 6); 3415 if (IS_ERR(cs)) 
3416 return PTR_ERR(cs); 3417 3418 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3419 intel_ring_advance(request, cs); 3420 } 3421 3422 return 0; 3423 } 3424 3425 static u32 preparser_disable(bool state) 3426 { 3427 return MI_ARB_CHECK | 1 << 8 | state; 3428 } 3429 3430 static int gen12_emit_flush_render(struct i915_request *request, 3431 u32 mode) 3432 { 3433 if (mode & EMIT_FLUSH) { 3434 u32 flags = 0; 3435 u32 *cs; 3436 3437 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3438 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3439 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3440 /* Wa_1409600907:tgl */ 3441 flags |= PIPE_CONTROL_DEPTH_STALL; 3442 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3443 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3444 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3445 3446 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3447 flags |= PIPE_CONTROL_QW_WRITE; 3448 3449 flags |= PIPE_CONTROL_CS_STALL; 3450 3451 cs = intel_ring_begin(request, 6); 3452 if (IS_ERR(cs)) 3453 return PTR_ERR(cs); 3454 3455 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3456 intel_ring_advance(request, cs); 3457 } 3458 3459 if (mode & EMIT_INVALIDATE) { 3460 u32 flags = 0; 3461 u32 *cs; 3462 3463 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3464 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3465 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3466 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3467 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3468 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3469 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3470 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 3471 3472 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3473 flags |= PIPE_CONTROL_QW_WRITE; 3474 3475 flags |= PIPE_CONTROL_CS_STALL; 3476 3477 cs = intel_ring_begin(request, 8); 3478 if (IS_ERR(cs)) 3479 return PTR_ERR(cs); 3480 3481 /* 3482 * Prevent the pre-parser from skipping past the TLB 3483 * invalidate and loading a stale page for the batch 3484 * buffer / request payload. 3485 */ 3486 *cs++ = preparser_disable(true); 3487 3488 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3489 3490 *cs++ = preparser_disable(false); 3491 intel_ring_advance(request, cs); 3492 3493 /* 3494 * Wa_1604544889:tgl 3495 */ 3496 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 3497 flags = 0; 3498 flags |= PIPE_CONTROL_CS_STALL; 3499 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3500 3501 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3502 flags |= PIPE_CONTROL_QW_WRITE; 3503 3504 cs = intel_ring_begin(request, 6); 3505 if (IS_ERR(cs)) 3506 return PTR_ERR(cs); 3507 3508 cs = gen8_emit_pipe_control(cs, flags, 3509 LRC_PPHWSP_SCRATCH_ADDR); 3510 intel_ring_advance(request, cs); 3511 } 3512 } 3513 3514 return 0; 3515 } 3516 3517 /* 3518 * Reserve space for 2 NOOPs at the end of each request to be 3519 * used as a workaround for not being allowed to do lite 3520 * restore with HEAD==TAIL (WaIdleLiteRestore). 3521 */ 3522 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 3523 { 3524 /* Ensure there's always at least one preemption point per-request. 
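	 * These two dwords also provide the WaIdleLiteRestore padding noted
	 * above: request->tail excludes them while wa_tail includes them, so
	 * a lite restore of the same context can bump RING_TAIL to wa_tail
	 * (see "last->tail = last->wa_tail" in execlists_dequeue()) and avoid
	 * submitting with HEAD == TAIL.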
*/ 3525 *cs++ = MI_ARB_CHECK; 3526 *cs++ = MI_NOOP; 3527 request->wa_tail = intel_ring_offset(request, cs); 3528 3529 return cs; 3530 } 3531 3532 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 3533 { 3534 *cs++ = MI_SEMAPHORE_WAIT | 3535 MI_SEMAPHORE_GLOBAL_GTT | 3536 MI_SEMAPHORE_POLL | 3537 MI_SEMAPHORE_SAD_EQ_SDD; 3538 *cs++ = 0; 3539 *cs++ = intel_hws_preempt_address(request->engine); 3540 *cs++ = 0; 3541 3542 return cs; 3543 } 3544 3545 static __always_inline u32* 3546 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 3547 u32 *cs) 3548 { 3549 *cs++ = MI_USER_INTERRUPT; 3550 3551 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3552 if (intel_engine_has_semaphores(request->engine)) 3553 cs = emit_preempt_busywait(request, cs); 3554 3555 request->tail = intel_ring_offset(request, cs); 3556 assert_ring_tail_valid(request->ring, request->tail); 3557 3558 return gen8_emit_wa_tail(request, cs); 3559 } 3560 3561 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 3562 { 3563 cs = gen8_emit_ggtt_write(cs, 3564 request->fence.seqno, 3565 i915_request_active_timeline(request)->hwsp_offset, 3566 0); 3567 3568 return gen8_emit_fini_breadcrumb_footer(request, cs); 3569 } 3570 3571 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3572 { 3573 cs = gen8_emit_pipe_control(cs, 3574 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3575 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3576 PIPE_CONTROL_DC_FLUSH_ENABLE, 3577 0); 3578 3579 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 3580 cs = gen8_emit_ggtt_write_rcs(cs, 3581 request->fence.seqno, 3582 i915_request_active_timeline(request)->hwsp_offset, 3583 PIPE_CONTROL_FLUSH_ENABLE | 3584 PIPE_CONTROL_CS_STALL); 3585 3586 return gen8_emit_fini_breadcrumb_footer(request, cs); 3587 } 3588 3589 static u32 * 3590 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3591 { 3592 cs = gen8_emit_ggtt_write_rcs(cs, 3593 request->fence.seqno, 3594 i915_request_active_timeline(request)->hwsp_offset, 3595 PIPE_CONTROL_CS_STALL | 3596 PIPE_CONTROL_TILE_CACHE_FLUSH | 3597 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3598 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3599 PIPE_CONTROL_DC_FLUSH_ENABLE | 3600 PIPE_CONTROL_FLUSH_ENABLE); 3601 3602 return gen8_emit_fini_breadcrumb_footer(request, cs); 3603 } 3604 3605 /* 3606 * Note that the CS instruction pre-parser will not stall on the breadcrumb 3607 * flush and will continue pre-fetching the instructions after it before the 3608 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 3609 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 3610 * of the next request before the memory has been flushed, we're guaranteed that 3611 * we won't access the batch itself too early. 3612 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 3613 * so, if the current request is modifying an instruction in the next request on 3614 * the same intel_context, we might pre-fetch and then execute the pre-update 3615 * instruction. To avoid this, the users of self-modifying code should either 3616 * disable the parser around the code emitting the memory writes, via a new flag 3617 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 3618 * the in-kernel use-cases we've opted to use a separate context, see 3619 * reloc_gpu() as an example. 3620 * All the above applies only to the instructions themselves. 
Non-inline data 3621 * used by the instructions is not pre-fetched. 3622 */ 3623 3624 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 3625 { 3626 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 3627 MI_SEMAPHORE_GLOBAL_GTT | 3628 MI_SEMAPHORE_POLL | 3629 MI_SEMAPHORE_SAD_EQ_SDD; 3630 *cs++ = 0; 3631 *cs++ = intel_hws_preempt_address(request->engine); 3632 *cs++ = 0; 3633 *cs++ = 0; 3634 *cs++ = MI_NOOP; 3635 3636 return cs; 3637 } 3638 3639 static __always_inline u32* 3640 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 3641 { 3642 *cs++ = MI_USER_INTERRUPT; 3643 3644 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3645 if (intel_engine_has_semaphores(request->engine)) 3646 cs = gen12_emit_preempt_busywait(request, cs); 3647 3648 request->tail = intel_ring_offset(request, cs); 3649 assert_ring_tail_valid(request->ring, request->tail); 3650 3651 return gen8_emit_wa_tail(request, cs); 3652 } 3653 3654 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 3655 { 3656 cs = gen8_emit_ggtt_write(cs, 3657 request->fence.seqno, 3658 i915_request_active_timeline(request)->hwsp_offset, 3659 0); 3660 3661 return gen12_emit_fini_breadcrumb_footer(request, cs); 3662 } 3663 3664 static u32 * 3665 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3666 { 3667 cs = gen8_emit_ggtt_write_rcs(cs, 3668 request->fence.seqno, 3669 i915_request_active_timeline(request)->hwsp_offset, 3670 PIPE_CONTROL_CS_STALL | 3671 PIPE_CONTROL_TILE_CACHE_FLUSH | 3672 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3673 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3674 /* Wa_1409600907:tgl */ 3675 PIPE_CONTROL_DEPTH_STALL | 3676 PIPE_CONTROL_DC_FLUSH_ENABLE | 3677 PIPE_CONTROL_FLUSH_ENABLE | 3678 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 3679 3680 return gen12_emit_fini_breadcrumb_footer(request, cs); 3681 } 3682 3683 static void execlists_park(struct intel_engine_cs *engine) 3684 { 3685 cancel_timer(&engine->execlists.timer); 3686 cancel_timer(&engine->execlists.preempt); 3687 } 3688 3689 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 3690 { 3691 engine->submit_request = execlists_submit_request; 3692 engine->cancel_requests = execlists_cancel_requests; 3693 engine->schedule = i915_schedule; 3694 engine->execlists.tasklet.func = execlists_submission_tasklet; 3695 3696 engine->reset.prepare = execlists_reset_prepare; 3697 engine->reset.reset = execlists_reset; 3698 engine->reset.finish = execlists_reset_finish; 3699 3700 engine->park = execlists_park; 3701 engine->unpark = NULL; 3702 3703 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 3704 if (!intel_vgpu_active(engine->i915)) { 3705 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 3706 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 3707 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 3708 } 3709 3710 if (INTEL_GEN(engine->i915) >= 12) 3711 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 3712 } 3713 3714 static void execlists_destroy(struct intel_engine_cs *engine) 3715 { 3716 intel_engine_cleanup_common(engine); 3717 lrc_destroy_wa_ctx(engine); 3718 kfree(engine); 3719 } 3720 3721 static void 3722 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 3723 { 3724 /* Default vfuncs which can be overriden by each engine. 
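	 * (For example, rcs_submission_override() below replaces emit_flush
	 * and emit_fini_breadcrumb with the render-specific variants.)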

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */

	engine->destroy = execlists_destroy;
	engine->resume = execlists_resume;

	engine->reset.prepare = execlists_reset_prepare;
	engine->reset.reset = execlists_reset;
	engine->reset.finish = execlists_reset_finish;

	engine->cops = &execlists_context_ops;
	engine->request_alloc = execlists_request_alloc;

	engine->emit_flush = gen8_emit_flush;
	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
	if (INTEL_GEN(engine->i915) >= 12)
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;

	engine->set_default_submission = intel_execlists_set_default_submission;

	if (INTEL_GEN(engine->i915) < 11) {
		engine->irq_enable = gen8_logical_ring_enable_irq;
		engine->irq_disable = gen8_logical_ring_disable_irq;
	} else {
		/*
		 * TODO: On Gen11 interrupt masks need to be clear
		 * to allow C6 entry. Keep interrupts enabled at all
		 * times and take the hit of generating extra interrupts
		 * until a more refined solution exists.
		 */
	}
	if (IS_GEN(engine->i915, 8))
		engine->emit_bb_start = gen8_emit_bb_start;
	else
		engine->emit_bb_start = gen9_emit_bb_start;
}

static inline void
logical_ring_default_irqs(struct intel_engine_cs *engine)
{
	unsigned int shift = 0;

	if (INTEL_GEN(engine->i915) < 11) {
		const u8 irq_shifts[] = {
			[RCS0] = GEN8_RCS_IRQ_SHIFT,
			[BCS0] = GEN8_BCS_IRQ_SHIFT,
			[VCS0] = GEN8_VCS0_IRQ_SHIFT,
			[VCS1] = GEN8_VCS1_IRQ_SHIFT,
			[VECS0] = GEN8_VECS_IRQ_SHIFT,
		};

		shift = irq_shifts[engine->id];
	}

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
}

static void rcs_submission_override(struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	case 12:
		engine->emit_flush = gen12_emit_flush_render;
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
		break;
	case 11:
		engine->emit_flush = gen11_emit_flush_render;
		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
		break;
	default:
		engine->emit_flush = gen8_emit_flush_render;
		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
		break;
	}
}

int intel_execlists_submission_setup(struct intel_engine_cs *engine)
{
	tasklet_init(&engine->execlists.tasklet,
		     execlists_submission_tasklet, (unsigned long)engine);
	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);

	logical_ring_default_vfuncs(engine);
	logical_ring_default_irqs(engine);

	if (engine->class == RENDER_CLASS)
		rcs_submission_override(engine);

	return 0;
}

int intel_execlists_submission_init(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct drm_i915_private *i915 = engine->i915;
	struct intel_uncore *uncore = engine->uncore;
	u32 base = engine->mmio_base;
	int ret;

	ret = intel_engine_init_common(engine);
	if (ret)
		return ret;

	if (intel_init_workaround_bb(engine))
		/*
		 * We continue even if we fail to initialize the WA batch,
		 * because we only expect rare glitches and nothing critical
		 * enough to prevent us from using the GPU.
		 */
		DRM_ERROR("WA batch buffer initialization failed\n");

	if (HAS_LOGICAL_RING_ELSQ(i915)) {
		execlists->submit_reg = uncore->regs +
			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
		execlists->ctrl_reg = uncore->regs +
			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
	} else {
		execlists->submit_reg = uncore->regs +
			i915_mmio_reg_offset(RING_ELSP(base));
	}

	execlists->csb_status =
		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];

	execlists->csb_write =
		&engine->status_page.addr[intel_hws_csb_write_index(i915)];

	if (INTEL_GEN(i915) < 11)
		execlists->csb_size = GEN8_CSB_ENTRIES;
	else
		execlists->csb_size = GEN11_CSB_ENTRIES;

	reset_csb_pointers(engine);

	return 0;
}

static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
{
	u32 indirect_ctx_offset;

	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		/* fall through */
	case 12:
		indirect_ctx_offset =
			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 11:
		indirect_ctx_offset =
			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 10:
		indirect_ctx_offset =
			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 9:
		indirect_ctx_offset =
			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 8:
		indirect_ctx_offset =
			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	}

	return indirect_ctx_offset;
}

static void init_common_reg_state(u32 * const regs,
				  const struct intel_engine_cs *engine,
				  const struct intel_ring *ring)
{
	regs[CTX_CONTEXT_CONTROL] =
		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	if (INTEL_GEN(engine->i915) < 11)
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					    CTX_CTRL_RS_CTX_ENABLE);

	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
	regs[CTX_BB_STATE] = RING_BB_PPGTT;
}

static void init_wa_bb_reg_state(u32 * const regs,
				 const struct intel_engine_cs *engine,
				 u32 pos_bb_per_ctx)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		regs[pos_bb_per_ctx] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		regs[pos_bb_per_ctx + 2] =
			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);

		regs[pos_bb_per_ctx + 4] =
			intel_lr_indirect_ctx_offset(engine) << 6;
	}
}

static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/*
		 * 64b PPGTT (48bit canonical):
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void execlists_init_reg_state(u32 *regs,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */
	u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);

	if (close) { /* Close the batch; used mainly by live_lrc_layout() */
		*bbe = MI_BATCH_BUFFER_END;
		if (INTEL_GEN(engine->i915) >= 10)
			*bbe |= BIT(0);
	}

	init_common_reg_state(regs, engine, ring);
	init_ppgtt_reg_state(regs, vm_alias(ce->vm));

	init_wa_bb_reg_state(regs, engine,
			     INTEL_GEN(engine->i915) >= 12 ?
			     GEN12_CTX_BB_PER_CTX_PTR :
			     CTX_BB_PER_CTX_PTR);
}

static int
populate_lr_context(struct intel_context *ce,
		    struct drm_i915_gem_object *ctx_obj,
		    struct intel_engine_cs *engine,
		    struct intel_ring *ring)
{
	bool inhibit = true;
	void *vaddr;
	u32 *regs;
	int ret;

	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
		return ret;
	}

	set_redzone(vaddr, engine);

	if (engine->default_state) {
		void *defaults;

		defaults = i915_gem_object_pin_map(engine->default_state,
						   I915_MAP_WB);
		if (IS_ERR(defaults)) {
			ret = PTR_ERR(defaults);
			goto err_unpin_ctx;
		}

		memcpy(vaddr, defaults, engine->context_size);
		i915_gem_object_unpin_map(engine->default_state);
		inhibit = false;
	}

	/*
	 * The second page of the context object contains some fields which
	 * must be set up prior to the first execution.
	 */
	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
	execlists_init_reg_state(regs, ce, engine, ring, inhibit);
	if (inhibit)
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	ret = 0;
err_unpin_ctx:
	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
	i915_gem_object_unpin_map(ctx_obj);
	return ret;
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *ctx_obj;
	struct intel_ring *ring;
	struct i915_vma *vma;
	u32 context_size;
	int ret;

	GEM_BUG_ON(ce->state);
	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(ctx_obj))
		return PTR_ERR(ctx_obj);

	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto error_deref_obj;
	}

	if (!ce->timeline) {
		struct intel_timeline *tl;

		tl = intel_timeline_create(engine->gt, NULL);
		if (IS_ERR(tl)) {
			ret = PTR_ERR(tl);
			goto error_deref_obj;
		}

		ce->timeline = tl;
	}

	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
	if (IS_ERR(ring)) {
		ret = PTR_ERR(ring);
		goto error_deref_obj;
	}

	ret = populate_lr_context(ce, ctx_obj, engine, ring);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
		goto error_ring_free;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

error_ring_free:
	intel_ring_put(ring);
error_deref_obj:
	i915_gem_object_put(ctx_obj);
	return ret;
}

static struct list_head *virtual_queue(struct virtual_engine *ve)
{
	return &ve->base.execlists.default_priolist.requests[0];
}

static void virtual_context_destroy(struct kref *kref)
{
	struct virtual_engine *ve =
		container_of(kref, typeof(*ve), context.ref);
	unsigned int n;

	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
	GEM_BUG_ON(ve->request);
	GEM_BUG_ON(ve->context.inflight);

	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct rb_node *node = &ve->nodes[sibling->id].rb;

		if (RB_EMPTY_NODE(node))
			continue;

		spin_lock_irq(&sibling->active.lock);

		/* Detachment is lazily performed in the execlists tasklet */
		if (!RB_EMPTY_NODE(node))
			rb_erase_cached(node, &sibling->execlists.virtual);

		spin_unlock_irq(&sibling->active.lock);
	}
	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));

	if (ve->context.state)
		__execlists_context_fini(&ve->context);
	intel_context_fini(&ve->context);

	kfree(ve->bonds);
	kfree(ve);
}
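
/*
 * Orientation comment (added summary; it only restates code in this file):
 * each virtual engine owns one ve_node per physical sibling. The nodes are
 * RB_CLEAR_NODE()ed in intel_execlists_create_virtual(), linked into a
 * sibling's execlists.virtual rbtree by virtual_submission_tasklet() while a
 * request is pending, and unlinked again by that tasklet or lazily by the
 * sibling's own execlists tasklet, with virtual_context_destroy() above
 * sweeping up any leftovers.
 */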

static void virtual_engine_initial_hint(struct virtual_engine *ve)
{
	int swp;

	/*
	 * Pick a random sibling on starting to help spread the load around.
	 *
	 * New contexts are typically created with exactly the same order
	 * of siblings, and often started in batches. Due to the way we iterate
	 * the array of siblings when submitting requests, sibling[0] is
	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
	 * randomised across the system, we also help spread the load by the
	 * first engine we inspect being different each time.
	 *
	 * NB This does not force us to execute on this engine, it will just
	 * typically be the first we inspect for submission.
	 */
	swp = prandom_u32_max(ve->num_siblings);
	if (!swp)
		return;

	swap(ve->siblings[swp], ve->siblings[0]);
	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
		virtual_update_register_offsets(ve->context.lrc_reg_state,
						ve->siblings[0]);
}

static int virtual_context_pin(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	int err;

	/* Note: we must use a real engine class for setting up reg state */
	err = __execlists_context_pin(ce, ve->siblings[0]);
	if (err)
		return err;

	virtual_engine_initial_hint(ve);
	return 0;
}

static void virtual_context_enter(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_get(ve->siblings[n]);

	intel_timeline_enter(ce->timeline);
}

static void virtual_context_exit(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	intel_timeline_exit(ce->timeline);

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_put(ve->siblings[n]);
}

static const struct intel_context_ops virtual_context_ops = {
	.pin = virtual_context_pin,
	.unpin = execlists_context_unpin,

	.enter = virtual_context_enter,
	.exit = virtual_context_exit,

	.destroy = virtual_context_destroy,
};
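
/*
 * Worked example (illustrative engine names only): with siblings {vcs0, vcs1}
 * and swp == 1, virtual_engine_initial_hint() above swaps the array to
 * {vcs1, vcs0}, so vcs1 becomes the first engine inspected when this context
 * submits. Because the context image was pinned against the original
 * siblings[0], the register offsets are rewritten for the new siblings[0]
 * unless the platform supports relative MMIO.
 */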

static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
{
	struct i915_request *rq;
	intel_engine_mask_t mask;

	rq = READ_ONCE(ve->request);
	if (!rq)
		return 0;

	/* The rq is ready for submission; rq->execution_mask is now stable. */
	mask = rq->execution_mask;
	if (unlikely(!mask)) {
		/* Invalid selection, submit to a random engine in error */
		i915_request_skip(rq, -ENODEV);
		mask = ve->siblings[0]->mask;
	}

	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
		  ve->base.name,
		  rq->fence.context, rq->fence.seqno,
		  mask, ve->base.execlists.queue_priority_hint);

	return mask;
}

static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = ve->base.execlists.queue_priority_hint;
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint) {
			sibling->execlists.queue_priority_hint = prio;
			tasklet_hi_schedule(&sibling->execlists.tasklet);
		}

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}
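
/*
 * Illustrative note on the fast path above: if the node is already in the
 * sibling's tree with the same priority, or with a lower priority while
 * already sitting at the leftmost (highest priority) position, the tree is
 * left untouched and we jump straight to submit_engine, avoiding an
 * erase/re-insert pair for the common case of repeated submissions at a
 * steady priority.
 */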

static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	GEM_TRACE("%s: rq=%llx:%lld\n",
		  ve->base.name,
		  rq->fence.context,
		  rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}

struct intel_context *
intel_execlists_create_virtual(struct i915_gem_context *ctx,
			       struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(ctx, siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = ctx->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;
	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_breadcrumbs(&ve->base);

	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, ctx, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	err = __execlists_context_alloc(&ve->context, siblings[0]);
	if (err)
		goto err_put;

	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);

	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct i915_gem_context *ctx,
			      struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(ctx,
					     se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	last = NULL;
	count = 0;
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   execlists->queue_priority_hint);
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	ce->ring->head = head;
	intel_ring_update_space(ce->ring);

	__execlists_update_reg_state(ce, engine);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif