1 /* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Ben Widawsky <ben@bwidawsk.net> 25 * Michel Thierry <michel.thierry@intel.com> 26 * Thomas Daniel <thomas.daniel@intel.com> 27 * Oscar Mateo <oscar.mateo@intel.com> 28 * 29 */ 30 31 /** 32 * DOC: Logical Rings, Logical Ring Contexts and Execlists 33 * 34 * Motivation: 35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts". 36 * These expanded contexts enable a number of new abilities, especially 37 * "Execlists" (also implemented in this file). 38 * 39 * One of the main differences with the legacy HW contexts is that logical 40 * ring contexts incorporate many more things to the context's state, like 41 * PDPs or ringbuffer control registers: 42 * 43 * The reason why PDPs are included in the context is straightforward: as 44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs 45 * contained there mean you don't need to do a ppgtt->switch_mm yourself, 46 * instead, the GPU will do it for you on the context switch. 47 * 48 * But, what about the ringbuffer control registers (head, tail, etc..)? 49 * shouldn't we just need a set of those per engine command streamer? This is 50 * where the name "Logical Rings" starts to make sense: by virtualizing the 51 * rings, the engine cs shifts to a new "ring buffer" with every context 52 * switch. When you want to submit a workload to the GPU you: A) choose your 53 * context, B) find its appropriate virtualized ring, C) write commands to it 54 * and then, finally, D) tell the GPU to switch to that context. 55 * 56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch 57 * to a contexts is via a context execution list, ergo "Execlists". 58 * 59 * LRC implementation: 60 * Regarding the creation of contexts, we have: 61 * 62 * - One global default context. 63 * - One local default context for each opened fd. 64 * - One local extra context for each context create ioctl call. 65 * 66 * Now that ringbuffers belong per-context (and not per-engine, like before) 67 * and that contexts are uniquely tied to a given engine (and not reusable, 68 * like before) we need: 69 * 70 * - One ringbuffer per-engine inside each context. 71 * - One backing object per-engine inside each context. 72 * 73 * The global default context starts its life with these new objects fully 74 * allocated and populated. The local default context for each opened fd is 75 * more complex, because we don't know at creation time which engine is going 76 * to use them. To handle this, we have implemented a deferred creation of LR 77 * contexts: 78 * 79 * The local context starts its life as a hollow or blank holder, that only 80 * gets populated for a given engine once we receive an execbuffer. If later 81 * on we receive another execbuffer ioctl for the same context but a different 82 * engine, we allocate/populate a new ringbuffer and context backing object and 83 * so on. 84 * 85 * Finally, regarding local contexts created using the ioctl call: as they are 86 * only allowed with the render ring, we can allocate & populate them right 87 * away (no need to defer anything, at least for now). 88 * 89 * Execlists implementation: 90 * Execlists are the new method by which, on gen8+ hardware, workloads are 91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method). 92 * This method works as follows: 93 * 94 * When a request is committed, its commands (the BB start and any leading or 95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer 96 * for the appropriate context. The tail pointer in the hardware context is not 97 * updated at this time, but instead, kept by the driver in the ringbuffer 98 * structure. A structure representing this request is added to a request queue 99 * for the appropriate engine: this structure contains a copy of the context's 100 * tail after the request was written to the ring buffer and a pointer to the 101 * context itself. 102 * 103 * If the engine's request queue was empty before the request was added, the 104 * queue is processed immediately. Otherwise the queue will be processed during 105 * a context switch interrupt. In any case, elements on the queue will get sent 106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a 107 * globally unique 20-bits submission ID. 108 * 109 * When execution of a request completes, the GPU updates the context status 110 * buffer with a context complete event and generates a context switch interrupt. 111 * During the interrupt handling, the driver examines the events in the buffer: 112 * for each context complete event, if the announced ID matches that on the head 113 * of the request queue, then that request is retired and removed from the queue. 114 * 115 * After processing, if any requests were retired and the queue is not empty 116 * then a new execution list can be submitted. The two requests at the front of 117 * the queue are next to be submitted but since a context may not occur twice in 118 * an execution list, if subsequent requests have the same ID as the first then 119 * the two requests must be combined. This is done simply by discarding requests 120 * at the head of the queue until either only one requests is left (in which case 121 * we use a NULL second context) or the first two requests have unique IDs. 122 * 123 * By always executing the first two requests in the queue the driver ensures 124 * that the GPU is kept as busy as possible. In the case where a single context 125 * completes but a second context is still executing, the request for this second 126 * context will be at the head of the queue when we remove the first one. This 127 * request will then be resubmitted along with a new request for a different context, 128 * which will cause the hardware to continue executing the second request and queue 129 * the new request (the GPU detects the condition of a context getting preempted 130 * with the same context and optimizes the context switch flow by not doing 131 * preemption, but just sampling the new tail pointer). 132 * 133 */ 134 #include <linux/interrupt.h> 135 136 #include "i915_drv.h" 137 #include "i915_perf.h" 138 #include "i915_trace.h" 139 #include "i915_vgpu.h" 140 #include "intel_context.h" 141 #include "intel_engine_pm.h" 142 #include "intel_gt.h" 143 #include "intel_gt_pm.h" 144 #include "intel_gt_requests.h" 145 #include "intel_lrc_reg.h" 146 #include "intel_mocs.h" 147 #include "intel_reset.h" 148 #include "intel_ring.h" 149 #include "intel_workarounds.h" 150 151 #define RING_EXECLIST_QFULL (1 << 0x2) 152 #define RING_EXECLIST1_VALID (1 << 0x3) 153 #define RING_EXECLIST0_VALID (1 << 0x4) 154 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE) 155 #define RING_EXECLIST1_ACTIVE (1 << 0x11) 156 #define RING_EXECLIST0_ACTIVE (1 << 0x12) 157 158 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0) 159 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1) 160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2) 161 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) 162 #define GEN8_CTX_STATUS_COMPLETE (1 << 4) 163 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) 164 165 #define GEN8_CTX_STATUS_COMPLETED_MASK \ 166 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) 167 168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2) 169 170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */ 171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */ 172 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15) 173 #define GEN12_IDLE_CTX_ID 0x7FF 174 #define GEN12_CSB_CTX_VALID(csb_dw) \ 175 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID) 176 177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */ 178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */ 179 #define WA_TAIL_DWORDS 2 180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS) 181 182 struct virtual_engine { 183 struct intel_engine_cs base; 184 struct intel_context context; 185 186 /* 187 * We allow only a single request through the virtual engine at a time 188 * (each request in the timeline waits for the completion fence of 189 * the previous before being submitted). By restricting ourselves to 190 * only submitting a single request, each request is placed on to a 191 * physical to maximise load spreading (by virtue of the late greedy 192 * scheduling -- each real engine takes the next available request 193 * upon idling). 194 */ 195 struct i915_request *request; 196 197 /* 198 * We keep a rbtree of available virtual engines inside each physical 199 * engine, sorted by priority. Here we preallocate the nodes we need 200 * for the virtual engine, indexed by physical_engine->id. 201 */ 202 struct ve_node { 203 struct rb_node rb; 204 int prio; 205 } nodes[I915_NUM_ENGINES]; 206 207 /* 208 * Keep track of bonded pairs -- restrictions upon on our selection 209 * of physical engines any particular request may be submitted to. 210 * If we receive a submit-fence from a master engine, we will only 211 * use one of sibling_mask physical engines. 212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[0]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine); 241 242 static void mark_eio(struct i915_request *rq) 243 { 244 if (i915_request_completed(rq)) 245 return; 246 247 GEM_BUG_ON(i915_request_signaled(rq)); 248 249 dma_fence_set_error(&rq->fence, -EIO); 250 i915_request_mark_complete(rq); 251 } 252 253 static struct i915_request * 254 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 255 { 256 struct i915_request *active = rq; 257 258 rcu_read_lock(); 259 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 260 if (i915_request_completed(rq)) 261 break; 262 263 active = rq; 264 } 265 rcu_read_unlock(); 266 267 return active; 268 } 269 270 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 271 { 272 return (i915_ggtt_offset(engine->status_page.vma) + 273 I915_GEM_HWS_PREEMPT_ADDR); 274 } 275 276 static inline void 277 ring_set_paused(const struct intel_engine_cs *engine, int state) 278 { 279 /* 280 * We inspect HWS_PREEMPT with a semaphore inside 281 * engine->emit_fini_breadcrumb. If the dword is true, 282 * the ring is paused as the semaphore will busywait 283 * until the dword is false. 284 */ 285 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 286 if (state) 287 wmb(); 288 } 289 290 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 291 { 292 return rb_entry(rb, struct i915_priolist, node); 293 } 294 295 static inline int rq_prio(const struct i915_request *rq) 296 { 297 return rq->sched.attr.priority; 298 } 299 300 static int effective_prio(const struct i915_request *rq) 301 { 302 int prio = rq_prio(rq); 303 304 /* 305 * If this request is special and must not be interrupted at any 306 * cost, so be it. Note we are only checking the most recent request 307 * in the context and so may be masking an earlier vip request. It 308 * is hoped that under the conditions where nopreempt is used, this 309 * will not matter (i.e. all requests to that context will be 310 * nopreempt for as long as desired). 311 */ 312 if (i915_request_has_nopreempt(rq)) 313 prio = I915_PRIORITY_UNPREEMPTABLE; 314 315 /* 316 * On unwinding the active request, we give it a priority bump 317 * if it has completed waiting on any semaphore. If we know that 318 * the request has already started, we can prevent an unwanted 319 * preempt-to-idle cycle by taking that into account now. 320 */ 321 if (__i915_request_has_started(rq)) 322 prio |= I915_PRIORITY_NOSEMAPHORE; 323 324 /* Restrict mere WAIT boosts from triggering preemption */ 325 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */ 326 return prio | __NO_PREEMPTION; 327 } 328 329 static int queue_prio(const struct intel_engine_execlists *execlists) 330 { 331 struct i915_priolist *p; 332 struct rb_node *rb; 333 334 rb = rb_first_cached(&execlists->queue); 335 if (!rb) 336 return INT_MIN; 337 338 /* 339 * As the priolist[] are inverted, with the highest priority in [0], 340 * we have to flip the index value to become priority. 341 */ 342 p = to_priolist(rb); 343 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 344 } 345 346 static inline bool need_preempt(const struct intel_engine_cs *engine, 347 const struct i915_request *rq, 348 struct rb_node *rb) 349 { 350 int last_prio; 351 352 if (!intel_engine_has_semaphores(engine)) 353 return false; 354 355 /* 356 * Check if the current priority hint merits a preemption attempt. 357 * 358 * We record the highest value priority we saw during rescheduling 359 * prior to this dequeue, therefore we know that if it is strictly 360 * less than the current tail of ESLP[0], we do not need to force 361 * a preempt-to-idle cycle. 362 * 363 * However, the priority hint is a mere hint that we may need to 364 * preempt. If that hint is stale or we may be trying to preempt 365 * ourselves, ignore the request. 366 * 367 * More naturally we would write 368 * prio >= max(0, last); 369 * except that we wish to prevent triggering preemption at the same 370 * priority level: the task that is running should remain running 371 * to preserve FIFO ordering of dependencies. 372 */ 373 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); 374 if (engine->execlists.queue_priority_hint <= last_prio) 375 return false; 376 377 /* 378 * Check against the first request in ELSP[1], it will, thanks to the 379 * power of PI, be the highest priority of that context. 380 */ 381 if (!list_is_last(&rq->sched.link, &engine->active.requests) && 382 rq_prio(list_next_entry(rq, sched.link)) > last_prio) 383 return true; 384 385 if (rb) { 386 struct virtual_engine *ve = 387 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 388 bool preempt = false; 389 390 if (engine == ve->siblings[0]) { /* only preempt one sibling */ 391 struct i915_request *next; 392 393 rcu_read_lock(); 394 next = READ_ONCE(ve->request); 395 if (next) 396 preempt = rq_prio(next) > last_prio; 397 rcu_read_unlock(); 398 } 399 400 if (preempt) 401 return preempt; 402 } 403 404 /* 405 * If the inflight context did not trigger the preemption, then maybe 406 * it was the set of queued requests? Pick the highest priority in 407 * the queue (the first active priolist) and see if it deserves to be 408 * running instead of ELSP[0]. 409 * 410 * The highest priority request in the queue can not be either 411 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same 412 * context, it's priority would not exceed ELSP[0] aka last_prio. 413 */ 414 return queue_prio(&engine->execlists) > last_prio; 415 } 416 417 __maybe_unused static inline bool 418 assert_priority_queue(const struct i915_request *prev, 419 const struct i915_request *next) 420 { 421 /* 422 * Without preemption, the prev may refer to the still active element 423 * which we refuse to let go. 424 * 425 * Even with preemption, there are times when we think it is better not 426 * to preempt and leave an ostensibly lower priority request in flight. 427 */ 428 if (i915_request_is_active(prev)) 429 return true; 430 431 return rq_prio(prev) >= rq_prio(next); 432 } 433 434 /* 435 * The context descriptor encodes various attributes of a context, 436 * including its GTT address and some flags. Because it's fairly 437 * expensive to calculate, we'll just do it once and cache the result, 438 * which remains valid until the context is unpinned. 439 * 440 * This is what a descriptor looks like, from LSB to MSB:: 441 * 442 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 443 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 444 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 445 * bits 53-54: mbz, reserved for use by hardware 446 * bits 55-63: group ID, currently unused and set to 0 447 * 448 * Starting from Gen11, the upper dword of the descriptor has a new format: 449 * 450 * bits 32-36: reserved 451 * bits 37-47: SW context ID 452 * bits 48:53: engine instance 453 * bit 54: mbz, reserved for use by hardware 454 * bits 55-60: SW counter 455 * bits 61-63: engine class 456 * 457 * engine info, SW context ID and SW counter need to form a unique number 458 * (Context ID) per lrc. 459 */ 460 static u64 461 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 462 { 463 u64 desc; 464 465 desc = INTEL_LEGACY_32B_CONTEXT; 466 if (i915_vm_is_4lvl(ce->vm)) 467 desc = INTEL_LEGACY_64B_CONTEXT; 468 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 469 470 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 471 if (IS_GEN(engine->i915, 8)) 472 desc |= GEN8_CTX_L3LLC_COHERENT; 473 474 desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */ 475 /* 476 * The following 32bits are copied into the OA reports (dword 2). 477 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing 478 * anything below. 479 */ 480 if (INTEL_GEN(engine->i915) >= 11) { 481 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 482 /* bits 48-53 */ 483 484 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 485 /* bits 61-63 */ 486 } 487 488 return desc; 489 } 490 491 static inline unsigned int dword_in_page(void *addr) 492 { 493 return offset_in_page(addr) / sizeof(u32); 494 } 495 496 static void set_offsets(u32 *regs, 497 const u8 *data, 498 const struct intel_engine_cs *engine, 499 bool clear) 500 #define NOP(x) (BIT(7) | (x)) 501 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 502 #define POSTED BIT(0) 503 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 504 #define REG16(x) \ 505 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 506 (((x) >> 2) & 0x7f) 507 #define END(x) 0, (x) 508 { 509 const u32 base = engine->mmio_base; 510 511 while (*data) { 512 u8 count, flags; 513 514 if (*data & BIT(7)) { /* skip */ 515 count = *data++ & ~BIT(7); 516 if (clear) 517 memset32(regs, MI_NOOP, count); 518 regs += count; 519 continue; 520 } 521 522 count = *data & 0x3f; 523 flags = *data >> 6; 524 data++; 525 526 *regs = MI_LOAD_REGISTER_IMM(count); 527 if (flags & POSTED) 528 *regs |= MI_LRI_FORCE_POSTED; 529 if (INTEL_GEN(engine->i915) >= 11) 530 *regs |= MI_LRI_CS_MMIO; 531 regs++; 532 533 GEM_BUG_ON(!count); 534 do { 535 u32 offset = 0; 536 u8 v; 537 538 do { 539 v = *data++; 540 offset <<= 7; 541 offset |= v & ~BIT(7); 542 } while (v & BIT(7)); 543 544 regs[0] = base + (offset << 2); 545 if (clear) 546 regs[1] = 0; 547 regs += 2; 548 } while (--count); 549 } 550 551 if (clear) { 552 u8 count = *++data; 553 554 /* Clear past the tail for HW access */ 555 GEM_BUG_ON(dword_in_page(regs) > count); 556 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 557 558 /* Close the batch; used mainly by live_lrc_layout() */ 559 *regs = MI_BATCH_BUFFER_END; 560 if (INTEL_GEN(engine->i915) >= 10) 561 *regs |= BIT(0); 562 } 563 } 564 565 static const u8 gen8_xcs_offsets[] = { 566 NOP(1), 567 LRI(11, 0), 568 REG16(0x244), 569 REG(0x034), 570 REG(0x030), 571 REG(0x038), 572 REG(0x03c), 573 REG(0x168), 574 REG(0x140), 575 REG(0x110), 576 REG(0x11c), 577 REG(0x114), 578 REG(0x118), 579 580 NOP(9), 581 LRI(9, 0), 582 REG16(0x3a8), 583 REG16(0x28c), 584 REG16(0x288), 585 REG16(0x284), 586 REG16(0x280), 587 REG16(0x27c), 588 REG16(0x278), 589 REG16(0x274), 590 REG16(0x270), 591 592 NOP(13), 593 LRI(2, 0), 594 REG16(0x200), 595 REG(0x028), 596 597 END(80) 598 }; 599 600 static const u8 gen9_xcs_offsets[] = { 601 NOP(1), 602 LRI(14, POSTED), 603 REG16(0x244), 604 REG(0x034), 605 REG(0x030), 606 REG(0x038), 607 REG(0x03c), 608 REG(0x168), 609 REG(0x140), 610 REG(0x110), 611 REG(0x11c), 612 REG(0x114), 613 REG(0x118), 614 REG(0x1c0), 615 REG(0x1c4), 616 REG(0x1c8), 617 618 NOP(3), 619 LRI(9, POSTED), 620 REG16(0x3a8), 621 REG16(0x28c), 622 REG16(0x288), 623 REG16(0x284), 624 REG16(0x280), 625 REG16(0x27c), 626 REG16(0x278), 627 REG16(0x274), 628 REG16(0x270), 629 630 NOP(13), 631 LRI(1, POSTED), 632 REG16(0x200), 633 634 NOP(13), 635 LRI(44, POSTED), 636 REG(0x028), 637 REG(0x09c), 638 REG(0x0c0), 639 REG(0x178), 640 REG(0x17c), 641 REG16(0x358), 642 REG(0x170), 643 REG(0x150), 644 REG(0x154), 645 REG(0x158), 646 REG16(0x41c), 647 REG16(0x600), 648 REG16(0x604), 649 REG16(0x608), 650 REG16(0x60c), 651 REG16(0x610), 652 REG16(0x614), 653 REG16(0x618), 654 REG16(0x61c), 655 REG16(0x620), 656 REG16(0x624), 657 REG16(0x628), 658 REG16(0x62c), 659 REG16(0x630), 660 REG16(0x634), 661 REG16(0x638), 662 REG16(0x63c), 663 REG16(0x640), 664 REG16(0x644), 665 REG16(0x648), 666 REG16(0x64c), 667 REG16(0x650), 668 REG16(0x654), 669 REG16(0x658), 670 REG16(0x65c), 671 REG16(0x660), 672 REG16(0x664), 673 REG16(0x668), 674 REG16(0x66c), 675 REG16(0x670), 676 REG16(0x674), 677 REG16(0x678), 678 REG16(0x67c), 679 REG(0x068), 680 681 END(176) 682 }; 683 684 static const u8 gen12_xcs_offsets[] = { 685 NOP(1), 686 LRI(13, POSTED), 687 REG16(0x244), 688 REG(0x034), 689 REG(0x030), 690 REG(0x038), 691 REG(0x03c), 692 REG(0x168), 693 REG(0x140), 694 REG(0x110), 695 REG(0x1c0), 696 REG(0x1c4), 697 REG(0x1c8), 698 REG(0x180), 699 REG16(0x2b4), 700 701 NOP(5), 702 LRI(9, POSTED), 703 REG16(0x3a8), 704 REG16(0x28c), 705 REG16(0x288), 706 REG16(0x284), 707 REG16(0x280), 708 REG16(0x27c), 709 REG16(0x278), 710 REG16(0x274), 711 REG16(0x270), 712 713 END(80) 714 }; 715 716 static const u8 gen8_rcs_offsets[] = { 717 NOP(1), 718 LRI(14, POSTED), 719 REG16(0x244), 720 REG(0x034), 721 REG(0x030), 722 REG(0x038), 723 REG(0x03c), 724 REG(0x168), 725 REG(0x140), 726 REG(0x110), 727 REG(0x11c), 728 REG(0x114), 729 REG(0x118), 730 REG(0x1c0), 731 REG(0x1c4), 732 REG(0x1c8), 733 734 NOP(3), 735 LRI(9, POSTED), 736 REG16(0x3a8), 737 REG16(0x28c), 738 REG16(0x288), 739 REG16(0x284), 740 REG16(0x280), 741 REG16(0x27c), 742 REG16(0x278), 743 REG16(0x274), 744 REG16(0x270), 745 746 NOP(13), 747 LRI(1, 0), 748 REG(0x0c8), 749 750 END(80) 751 }; 752 753 static const u8 gen9_rcs_offsets[] = { 754 NOP(1), 755 LRI(14, POSTED), 756 REG16(0x244), 757 REG(0x34), 758 REG(0x30), 759 REG(0x38), 760 REG(0x3c), 761 REG(0x168), 762 REG(0x140), 763 REG(0x110), 764 REG(0x11c), 765 REG(0x114), 766 REG(0x118), 767 REG(0x1c0), 768 REG(0x1c4), 769 REG(0x1c8), 770 771 NOP(3), 772 LRI(9, POSTED), 773 REG16(0x3a8), 774 REG16(0x28c), 775 REG16(0x288), 776 REG16(0x284), 777 REG16(0x280), 778 REG16(0x27c), 779 REG16(0x278), 780 REG16(0x274), 781 REG16(0x270), 782 783 NOP(13), 784 LRI(1, 0), 785 REG(0xc8), 786 787 NOP(13), 788 LRI(44, POSTED), 789 REG(0x28), 790 REG(0x9c), 791 REG(0xc0), 792 REG(0x178), 793 REG(0x17c), 794 REG16(0x358), 795 REG(0x170), 796 REG(0x150), 797 REG(0x154), 798 REG(0x158), 799 REG16(0x41c), 800 REG16(0x600), 801 REG16(0x604), 802 REG16(0x608), 803 REG16(0x60c), 804 REG16(0x610), 805 REG16(0x614), 806 REG16(0x618), 807 REG16(0x61c), 808 REG16(0x620), 809 REG16(0x624), 810 REG16(0x628), 811 REG16(0x62c), 812 REG16(0x630), 813 REG16(0x634), 814 REG16(0x638), 815 REG16(0x63c), 816 REG16(0x640), 817 REG16(0x644), 818 REG16(0x648), 819 REG16(0x64c), 820 REG16(0x650), 821 REG16(0x654), 822 REG16(0x658), 823 REG16(0x65c), 824 REG16(0x660), 825 REG16(0x664), 826 REG16(0x668), 827 REG16(0x66c), 828 REG16(0x670), 829 REG16(0x674), 830 REG16(0x678), 831 REG16(0x67c), 832 REG(0x68), 833 834 END(176) 835 }; 836 837 static const u8 gen11_rcs_offsets[] = { 838 NOP(1), 839 LRI(15, POSTED), 840 REG16(0x244), 841 REG(0x034), 842 REG(0x030), 843 REG(0x038), 844 REG(0x03c), 845 REG(0x168), 846 REG(0x140), 847 REG(0x110), 848 REG(0x11c), 849 REG(0x114), 850 REG(0x118), 851 REG(0x1c0), 852 REG(0x1c4), 853 REG(0x1c8), 854 REG(0x180), 855 856 NOP(1), 857 LRI(9, POSTED), 858 REG16(0x3a8), 859 REG16(0x28c), 860 REG16(0x288), 861 REG16(0x284), 862 REG16(0x280), 863 REG16(0x27c), 864 REG16(0x278), 865 REG16(0x274), 866 REG16(0x270), 867 868 LRI(1, POSTED), 869 REG(0x1b0), 870 871 NOP(10), 872 LRI(1, 0), 873 REG(0x0c8), 874 875 END(80) 876 }; 877 878 static const u8 gen12_rcs_offsets[] = { 879 NOP(1), 880 LRI(13, POSTED), 881 REG16(0x244), 882 REG(0x034), 883 REG(0x030), 884 REG(0x038), 885 REG(0x03c), 886 REG(0x168), 887 REG(0x140), 888 REG(0x110), 889 REG(0x1c0), 890 REG(0x1c4), 891 REG(0x1c8), 892 REG(0x180), 893 REG16(0x2b4), 894 895 NOP(5), 896 LRI(9, POSTED), 897 REG16(0x3a8), 898 REG16(0x28c), 899 REG16(0x288), 900 REG16(0x284), 901 REG16(0x280), 902 REG16(0x27c), 903 REG16(0x278), 904 REG16(0x274), 905 REG16(0x270), 906 907 LRI(3, POSTED), 908 REG(0x1b0), 909 REG16(0x5a8), 910 REG16(0x5ac), 911 912 NOP(6), 913 LRI(1, 0), 914 REG(0x0c8), 915 916 END(80) 917 }; 918 919 #undef END 920 #undef REG16 921 #undef REG 922 #undef LRI 923 #undef NOP 924 925 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 926 { 927 /* 928 * The gen12+ lists only have the registers we program in the basic 929 * default state. We rely on the context image using relative 930 * addressing to automatic fixup the register state between the 931 * physical engines for virtual engine. 932 */ 933 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 934 !intel_engine_has_relative_mmio(engine)); 935 936 if (engine->class == RENDER_CLASS) { 937 if (INTEL_GEN(engine->i915) >= 12) 938 return gen12_rcs_offsets; 939 else if (INTEL_GEN(engine->i915) >= 11) 940 return gen11_rcs_offsets; 941 else if (INTEL_GEN(engine->i915) >= 9) 942 return gen9_rcs_offsets; 943 else 944 return gen8_rcs_offsets; 945 } else { 946 if (INTEL_GEN(engine->i915) >= 12) 947 return gen12_xcs_offsets; 948 else if (INTEL_GEN(engine->i915) >= 9) 949 return gen9_xcs_offsets; 950 else 951 return gen8_xcs_offsets; 952 } 953 } 954 955 static struct i915_request * 956 __unwind_incomplete_requests(struct intel_engine_cs *engine) 957 { 958 struct i915_request *rq, *rn, *active = NULL; 959 struct list_head *uninitialized_var(pl); 960 int prio = I915_PRIORITY_INVALID; 961 962 lockdep_assert_held(&engine->active.lock); 963 964 list_for_each_entry_safe_reverse(rq, rn, 965 &engine->active.requests, 966 sched.link) { 967 if (i915_request_completed(rq)) 968 continue; /* XXX */ 969 970 __i915_request_unsubmit(rq); 971 972 /* 973 * Push the request back into the queue for later resubmission. 974 * If this request is not native to this physical engine (i.e. 975 * it came from a virtual source), push it back onto the virtual 976 * engine so that it can be moved across onto another physical 977 * engine as load dictates. 978 */ 979 if (likely(rq->execution_mask == engine->mask)) { 980 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 981 if (rq_prio(rq) != prio) { 982 prio = rq_prio(rq); 983 pl = i915_sched_lookup_priolist(engine, prio); 984 } 985 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 986 987 list_move(&rq->sched.link, pl); 988 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 989 990 active = rq; 991 } else { 992 struct intel_engine_cs *owner = rq->context->engine; 993 994 /* 995 * Decouple the virtual breadcrumb before moving it 996 * back to the virtual engine -- we don't want the 997 * request to complete in the background and try 998 * and cancel the breadcrumb on the virtual engine 999 * (instead of the old engine where it is linked)! 1000 */ 1001 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 1002 &rq->fence.flags)) { 1003 spin_lock_nested(&rq->lock, 1004 SINGLE_DEPTH_NESTING); 1005 i915_request_cancel_breadcrumb(rq); 1006 spin_unlock(&rq->lock); 1007 } 1008 rq->engine = owner; 1009 owner->submit_request(rq); 1010 active = NULL; 1011 } 1012 } 1013 1014 return active; 1015 } 1016 1017 struct i915_request * 1018 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1019 { 1020 struct intel_engine_cs *engine = 1021 container_of(execlists, typeof(*engine), execlists); 1022 1023 return __unwind_incomplete_requests(engine); 1024 } 1025 1026 static inline void 1027 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1028 { 1029 /* 1030 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1031 * The compiler should eliminate this function as dead-code. 1032 */ 1033 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1034 return; 1035 1036 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1037 status, rq); 1038 } 1039 1040 static void intel_engine_context_in(struct intel_engine_cs *engine) 1041 { 1042 unsigned long flags; 1043 1044 if (READ_ONCE(engine->stats.enabled) == 0) 1045 return; 1046 1047 write_seqlock_irqsave(&engine->stats.lock, flags); 1048 1049 if (engine->stats.enabled > 0) { 1050 if (engine->stats.active++ == 0) 1051 engine->stats.start = ktime_get(); 1052 GEM_BUG_ON(engine->stats.active == 0); 1053 } 1054 1055 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1056 } 1057 1058 static void intel_engine_context_out(struct intel_engine_cs *engine) 1059 { 1060 unsigned long flags; 1061 1062 if (READ_ONCE(engine->stats.enabled) == 0) 1063 return; 1064 1065 write_seqlock_irqsave(&engine->stats.lock, flags); 1066 1067 if (engine->stats.enabled > 0) { 1068 ktime_t last; 1069 1070 if (engine->stats.active && --engine->stats.active == 0) { 1071 /* 1072 * Decrement the active context count and in case GPU 1073 * is now idle add up to the running total. 1074 */ 1075 last = ktime_sub(ktime_get(), engine->stats.start); 1076 1077 engine->stats.total = ktime_add(engine->stats.total, 1078 last); 1079 } else if (engine->stats.active == 0) { 1080 /* 1081 * After turning on engine stats, context out might be 1082 * the first event in which case we account from the 1083 * time stats gathering was turned on. 1084 */ 1085 last = ktime_sub(ktime_get(), engine->stats.enabled_at); 1086 1087 engine->stats.total = ktime_add(engine->stats.total, 1088 last); 1089 } 1090 } 1091 1092 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1093 } 1094 1095 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 1096 { 1097 if (INTEL_GEN(engine->i915) >= 12) 1098 return 0x60; 1099 else if (INTEL_GEN(engine->i915) >= 9) 1100 return 0x54; 1101 else if (engine->class == RENDER_CLASS) 1102 return 0x58; 1103 else 1104 return -1; 1105 } 1106 1107 static void 1108 execlists_check_context(const struct intel_context *ce, 1109 const struct intel_engine_cs *engine) 1110 { 1111 const struct intel_ring *ring = ce->ring; 1112 u32 *regs = ce->lrc_reg_state; 1113 bool valid = true; 1114 int x; 1115 1116 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1117 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1118 engine->name, 1119 regs[CTX_RING_START], 1120 i915_ggtt_offset(ring->vma)); 1121 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1122 valid = false; 1123 } 1124 1125 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1126 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1127 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1128 engine->name, 1129 regs[CTX_RING_CTL], 1130 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1131 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1132 valid = false; 1133 } 1134 1135 x = lrc_ring_mi_mode(engine); 1136 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1137 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1138 engine->name, regs[x + 1]); 1139 regs[x + 1] &= ~STOP_RING; 1140 regs[x + 1] |= STOP_RING << 16; 1141 valid = false; 1142 } 1143 1144 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1145 } 1146 1147 static void restore_default_state(struct intel_context *ce, 1148 struct intel_engine_cs *engine) 1149 { 1150 u32 *regs = ce->lrc_reg_state; 1151 1152 if (engine->pinned_default_state) 1153 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 1154 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 1155 engine->context_size - PAGE_SIZE); 1156 1157 execlists_init_reg_state(regs, ce, engine, ce->ring, false); 1158 } 1159 1160 static void reset_active(struct i915_request *rq, 1161 struct intel_engine_cs *engine) 1162 { 1163 struct intel_context * const ce = rq->context; 1164 u32 head; 1165 1166 /* 1167 * The executing context has been cancelled. We want to prevent 1168 * further execution along this context and propagate the error on 1169 * to anything depending on its results. 1170 * 1171 * In __i915_request_submit(), we apply the -EIO and remove the 1172 * requests' payloads for any banned requests. But first, we must 1173 * rewind the context back to the start of the incomplete request so 1174 * that we do not jump back into the middle of the batch. 1175 * 1176 * We preserve the breadcrumbs and semaphores of the incomplete 1177 * requests so that inter-timeline dependencies (i.e other timelines) 1178 * remain correctly ordered. And we defer to __i915_request_submit() 1179 * so that all asynchronous waits are correctly handled. 1180 */ 1181 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1182 rq->fence.context, rq->fence.seqno); 1183 1184 /* On resubmission of the active request, payload will be scrubbed */ 1185 if (i915_request_completed(rq)) 1186 head = rq->tail; 1187 else 1188 head = active_request(ce->timeline, rq)->head; 1189 ce->ring->head = intel_ring_wrap(ce->ring, head); 1190 intel_ring_update_space(ce->ring); 1191 1192 /* Scrub the context image to prevent replaying the previous batch */ 1193 restore_default_state(ce, engine); 1194 __execlists_update_reg_state(ce, engine); 1195 1196 /* We've switched away, so this should be a no-op, but intent matters */ 1197 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1198 } 1199 1200 static inline struct intel_engine_cs * 1201 __execlists_schedule_in(struct i915_request *rq) 1202 { 1203 struct intel_engine_cs * const engine = rq->engine; 1204 struct intel_context * const ce = rq->context; 1205 1206 intel_context_get(ce); 1207 1208 if (unlikely(intel_context_is_banned(ce))) 1209 reset_active(rq, engine); 1210 1211 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1212 execlists_check_context(ce, engine); 1213 1214 if (ce->tag) { 1215 /* Use a fixed tag for OA and friends */ 1216 ce->lrc_desc |= (u64)ce->tag << 32; 1217 } else { 1218 /* We don't need a strict matching tag, just different values */ 1219 ce->lrc_desc &= ~GENMASK_ULL(47, 37); 1220 ce->lrc_desc |= 1221 (u64)(++engine->context_tag % NUM_CONTEXT_TAG) << 1222 GEN11_SW_CTX_ID_SHIFT; 1223 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID); 1224 } 1225 1226 __intel_gt_pm_get(engine->gt); 1227 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1228 intel_engine_context_in(engine); 1229 1230 return engine; 1231 } 1232 1233 static inline struct i915_request * 1234 execlists_schedule_in(struct i915_request *rq, int idx) 1235 { 1236 struct intel_context * const ce = rq->context; 1237 struct intel_engine_cs *old; 1238 1239 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1240 trace_i915_request_in(rq, idx); 1241 1242 old = READ_ONCE(ce->inflight); 1243 do { 1244 if (!old) { 1245 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1246 break; 1247 } 1248 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1249 1250 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1251 return i915_request_get(rq); 1252 } 1253 1254 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1255 { 1256 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1257 struct i915_request *next = READ_ONCE(ve->request); 1258 1259 if (next && next->execution_mask & ~rq->execution_mask) 1260 tasklet_schedule(&ve->base.execlists.tasklet); 1261 } 1262 1263 static inline void 1264 __execlists_schedule_out(struct i915_request *rq, 1265 struct intel_engine_cs * const engine) 1266 { 1267 struct intel_context * const ce = rq->context; 1268 1269 /* 1270 * NB process_csb() is not under the engine->active.lock and hence 1271 * schedule_out can race with schedule_in meaning that we should 1272 * refrain from doing non-trivial work here. 1273 */ 1274 1275 /* 1276 * If we have just completed this context, the engine may now be 1277 * idle and we want to re-enter powersaving. 1278 */ 1279 if (list_is_last(&rq->link, &ce->timeline->requests) && 1280 i915_request_completed(rq)) 1281 intel_engine_add_retire(engine, ce->timeline); 1282 1283 intel_engine_context_out(engine); 1284 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1285 intel_gt_pm_put_async(engine->gt); 1286 1287 /* 1288 * If this is part of a virtual engine, its next request may 1289 * have been blocked waiting for access to the active context. 1290 * We have to kick all the siblings again in case we need to 1291 * switch (e.g. the next request is not runnable on this 1292 * engine). Hopefully, we will already have submitted the next 1293 * request before the tasklet runs and do not need to rebuild 1294 * each virtual tree and kick everyone again. 1295 */ 1296 if (ce->engine != engine) 1297 kick_siblings(rq, ce); 1298 1299 intel_context_put(ce); 1300 } 1301 1302 static inline void 1303 execlists_schedule_out(struct i915_request *rq) 1304 { 1305 struct intel_context * const ce = rq->context; 1306 struct intel_engine_cs *cur, *old; 1307 1308 trace_i915_request_out(rq); 1309 1310 old = READ_ONCE(ce->inflight); 1311 do 1312 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1313 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1314 if (!cur) 1315 __execlists_schedule_out(rq, old); 1316 1317 i915_request_put(rq); 1318 } 1319 1320 static u64 execlists_update_context(struct i915_request *rq) 1321 { 1322 struct intel_context *ce = rq->context; 1323 u64 desc = ce->lrc_desc; 1324 u32 tail; 1325 1326 /* 1327 * WaIdleLiteRestore:bdw,skl 1328 * 1329 * We should never submit the context with the same RING_TAIL twice 1330 * just in case we submit an empty ring, which confuses the HW. 1331 * 1332 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1333 * the normal request to be able to always advance the RING_TAIL on 1334 * subsequent resubmissions (for lite restore). Should that fail us, 1335 * and we try and submit the same tail again, force the context 1336 * reload. 1337 */ 1338 tail = intel_ring_set_tail(rq->ring, rq->tail); 1339 if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail)) 1340 desc |= CTX_DESC_FORCE_RESTORE; 1341 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1342 rq->tail = rq->wa_tail; 1343 1344 /* 1345 * Make sure the context image is complete before we submit it to HW. 1346 * 1347 * Ostensibly, writes (including the WCB) should be flushed prior to 1348 * an uncached write such as our mmio register access, the empirical 1349 * evidence (esp. on Braswell) suggests that the WC write into memory 1350 * may not be visible to the HW prior to the completion of the UC 1351 * register write and that we may begin execution from the context 1352 * before its image is complete leading to invalid PD chasing. 1353 */ 1354 wmb(); 1355 1356 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; 1357 return desc; 1358 } 1359 1360 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1361 { 1362 if (execlists->ctrl_reg) { 1363 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1364 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1365 } else { 1366 writel(upper_32_bits(desc), execlists->submit_reg); 1367 writel(lower_32_bits(desc), execlists->submit_reg); 1368 } 1369 } 1370 1371 static __maybe_unused void 1372 trace_ports(const struct intel_engine_execlists *execlists, 1373 const char *msg, 1374 struct i915_request * const *ports) 1375 { 1376 const struct intel_engine_cs *engine = 1377 container_of(execlists, typeof(*engine), execlists); 1378 1379 if (!ports[0]) 1380 return; 1381 1382 ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg, 1383 ports[0]->fence.context, 1384 ports[0]->fence.seqno, 1385 i915_request_completed(ports[0]) ? "!" : 1386 i915_request_started(ports[0]) ? "*" : 1387 "", 1388 ports[1] ? ports[1]->fence.context : 0, 1389 ports[1] ? ports[1]->fence.seqno : 0); 1390 } 1391 1392 static __maybe_unused bool 1393 assert_pending_valid(const struct intel_engine_execlists *execlists, 1394 const char *msg) 1395 { 1396 struct i915_request * const *port, *rq; 1397 struct intel_context *ce = NULL; 1398 1399 trace_ports(execlists, msg, execlists->pending); 1400 1401 if (!execlists->pending[0]) { 1402 GEM_TRACE_ERR("Nothing pending for promotion!\n"); 1403 return false; 1404 } 1405 1406 if (execlists->pending[execlists_num_ports(execlists)]) { 1407 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n", 1408 execlists_num_ports(execlists)); 1409 return false; 1410 } 1411 1412 for (port = execlists->pending; (rq = *port); port++) { 1413 unsigned long flags; 1414 bool ok = true; 1415 1416 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1417 GEM_BUG_ON(!i915_request_is_active(rq)); 1418 1419 if (ce == rq->context) { 1420 GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n", 1421 ce->timeline->fence_context, 1422 port - execlists->pending); 1423 return false; 1424 } 1425 ce = rq->context; 1426 1427 /* Hold tightly onto the lock to prevent concurrent retires! */ 1428 if (!spin_trylock_irqsave(&rq->lock, flags)) 1429 continue; 1430 1431 if (i915_request_completed(rq)) 1432 goto unlock; 1433 1434 if (i915_active_is_idle(&ce->active) && 1435 !intel_context_is_barrier(ce)) { 1436 GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n", 1437 ce->timeline->fence_context, 1438 port - execlists->pending); 1439 ok = false; 1440 goto unlock; 1441 } 1442 1443 if (!i915_vma_is_pinned(ce->state)) { 1444 GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n", 1445 ce->timeline->fence_context, 1446 port - execlists->pending); 1447 ok = false; 1448 goto unlock; 1449 } 1450 1451 if (!i915_vma_is_pinned(ce->ring->vma)) { 1452 GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n", 1453 ce->timeline->fence_context, 1454 port - execlists->pending); 1455 ok = false; 1456 goto unlock; 1457 } 1458 1459 unlock: 1460 spin_unlock_irqrestore(&rq->lock, flags); 1461 if (!ok) 1462 return false; 1463 } 1464 1465 return ce; 1466 } 1467 1468 static void execlists_submit_ports(struct intel_engine_cs *engine) 1469 { 1470 struct intel_engine_execlists *execlists = &engine->execlists; 1471 unsigned int n; 1472 1473 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1474 1475 /* 1476 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1477 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1478 * not be relinquished until the device is idle (see 1479 * i915_gem_idle_work_handler()). As a precaution, we make sure 1480 * that all ELSP are drained i.e. we have processed the CSB, 1481 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1482 */ 1483 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1484 1485 /* 1486 * ELSQ note: the submit queue is not cleared after being submitted 1487 * to the HW so we need to make sure we always clean it up. This is 1488 * currently ensured by the fact that we always write the same number 1489 * of elsq entries, keep this in mind before changing the loop below. 1490 */ 1491 for (n = execlists_num_ports(execlists); n--; ) { 1492 struct i915_request *rq = execlists->pending[n]; 1493 1494 write_desc(execlists, 1495 rq ? execlists_update_context(rq) : 0, 1496 n); 1497 } 1498 1499 /* we need to manually load the submit queue */ 1500 if (execlists->ctrl_reg) 1501 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1502 } 1503 1504 static bool ctx_single_port_submission(const struct intel_context *ce) 1505 { 1506 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1507 intel_context_force_single_submission(ce)); 1508 } 1509 1510 static bool can_merge_ctx(const struct intel_context *prev, 1511 const struct intel_context *next) 1512 { 1513 if (prev != next) 1514 return false; 1515 1516 if (ctx_single_port_submission(prev)) 1517 return false; 1518 1519 return true; 1520 } 1521 1522 static bool can_merge_rq(const struct i915_request *prev, 1523 const struct i915_request *next) 1524 { 1525 GEM_BUG_ON(prev == next); 1526 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1527 1528 /* 1529 * We do not submit known completed requests. Therefore if the next 1530 * request is already completed, we can pretend to merge it in 1531 * with the previous context (and we will skip updating the ELSP 1532 * and tracking). Thus hopefully keeping the ELSP full with active 1533 * contexts, despite the best efforts of preempt-to-busy to confuse 1534 * us. 1535 */ 1536 if (i915_request_completed(next)) 1537 return true; 1538 1539 if (unlikely((prev->fence.flags ^ next->fence.flags) & 1540 (BIT(I915_FENCE_FLAG_NOPREEMPT) | 1541 BIT(I915_FENCE_FLAG_SENTINEL)))) 1542 return false; 1543 1544 if (!can_merge_ctx(prev->context, next->context)) 1545 return false; 1546 1547 return true; 1548 } 1549 1550 static void virtual_update_register_offsets(u32 *regs, 1551 struct intel_engine_cs *engine) 1552 { 1553 set_offsets(regs, reg_offsets(engine), engine, false); 1554 } 1555 1556 static bool virtual_matches(const struct virtual_engine *ve, 1557 const struct i915_request *rq, 1558 const struct intel_engine_cs *engine) 1559 { 1560 const struct intel_engine_cs *inflight; 1561 1562 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 1563 return false; 1564 1565 /* 1566 * We track when the HW has completed saving the context image 1567 * (i.e. when we have seen the final CS event switching out of 1568 * the context) and must not overwrite the context image before 1569 * then. This restricts us to only using the active engine 1570 * while the previous virtualized request is inflight (so 1571 * we reuse the register offsets). This is a very small 1572 * hystersis on the greedy seelction algorithm. 1573 */ 1574 inflight = intel_context_inflight(&ve->context); 1575 if (inflight && inflight != engine) 1576 return false; 1577 1578 return true; 1579 } 1580 1581 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, 1582 struct intel_engine_cs *engine) 1583 { 1584 struct intel_engine_cs *old = ve->siblings[0]; 1585 1586 /* All unattached (rq->engine == old) must already be completed */ 1587 1588 spin_lock(&old->breadcrumbs.irq_lock); 1589 if (!list_empty(&ve->context.signal_link)) { 1590 list_move_tail(&ve->context.signal_link, 1591 &engine->breadcrumbs.signalers); 1592 intel_engine_signal_breadcrumbs(engine); 1593 } 1594 spin_unlock(&old->breadcrumbs.irq_lock); 1595 } 1596 1597 static struct i915_request * 1598 last_active(const struct intel_engine_execlists *execlists) 1599 { 1600 struct i915_request * const *last = READ_ONCE(execlists->active); 1601 1602 while (*last && i915_request_completed(*last)) 1603 last++; 1604 1605 return *last; 1606 } 1607 1608 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1609 { 1610 LIST_HEAD(list); 1611 1612 /* 1613 * We want to move the interrupted request to the back of 1614 * the round-robin list (i.e. its priority level), but 1615 * in doing so, we must then move all requests that were in 1616 * flight and were waiting for the interrupted request to 1617 * be run after it again. 1618 */ 1619 do { 1620 struct i915_dependency *p; 1621 1622 GEM_BUG_ON(i915_request_is_active(rq)); 1623 list_move_tail(&rq->sched.link, pl); 1624 1625 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 1626 struct i915_request *w = 1627 container_of(p->waiter, typeof(*w), sched); 1628 1629 /* Leave semaphores spinning on the other engines */ 1630 if (w->engine != rq->engine) 1631 continue; 1632 1633 /* No waiter should start before its signaler */ 1634 GEM_BUG_ON(i915_request_started(w) && 1635 !i915_request_completed(rq)); 1636 1637 GEM_BUG_ON(i915_request_is_active(w)); 1638 if (!i915_request_is_ready(w)) 1639 continue; 1640 1641 if (rq_prio(w) < rq_prio(rq)) 1642 continue; 1643 1644 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1645 list_move_tail(&w->sched.link, &list); 1646 } 1647 1648 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1649 } while (rq); 1650 } 1651 1652 static void defer_active(struct intel_engine_cs *engine) 1653 { 1654 struct i915_request *rq; 1655 1656 rq = __unwind_incomplete_requests(engine); 1657 if (!rq) 1658 return; 1659 1660 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1661 } 1662 1663 static bool 1664 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 1665 { 1666 int hint; 1667 1668 if (!intel_engine_has_timeslices(engine)) 1669 return false; 1670 1671 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1672 return false; 1673 1674 hint = max(rq_prio(list_next_entry(rq, sched.link)), 1675 engine->execlists.queue_priority_hint); 1676 1677 return hint >= effective_prio(rq); 1678 } 1679 1680 static int 1681 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1682 { 1683 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1684 return INT_MIN; 1685 1686 return rq_prio(list_next_entry(rq, sched.link)); 1687 } 1688 1689 static inline unsigned long 1690 timeslice(const struct intel_engine_cs *engine) 1691 { 1692 return READ_ONCE(engine->props.timeslice_duration_ms); 1693 } 1694 1695 static unsigned long 1696 active_timeslice(const struct intel_engine_cs *engine) 1697 { 1698 const struct i915_request *rq = *engine->execlists.active; 1699 1700 if (!rq || i915_request_completed(rq)) 1701 return 0; 1702 1703 if (engine->execlists.switch_priority_hint < effective_prio(rq)) 1704 return 0; 1705 1706 return timeslice(engine); 1707 } 1708 1709 static void set_timeslice(struct intel_engine_cs *engine) 1710 { 1711 if (!intel_engine_has_timeslices(engine)) 1712 return; 1713 1714 set_timer_ms(&engine->execlists.timer, active_timeslice(engine)); 1715 } 1716 1717 static void record_preemption(struct intel_engine_execlists *execlists) 1718 { 1719 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 1720 } 1721 1722 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine) 1723 { 1724 struct i915_request *rq; 1725 1726 rq = last_active(&engine->execlists); 1727 if (!rq) 1728 return 0; 1729 1730 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 1731 if (unlikely(intel_context_is_banned(rq->context))) 1732 return 1; 1733 1734 return READ_ONCE(engine->props.preempt_timeout_ms); 1735 } 1736 1737 static void set_preempt_timeout(struct intel_engine_cs *engine) 1738 { 1739 if (!intel_engine_has_preempt_reset(engine)) 1740 return; 1741 1742 set_timer_ms(&engine->execlists.preempt, 1743 active_preempt_timeout(engine)); 1744 } 1745 1746 static inline void clear_ports(struct i915_request **ports, int count) 1747 { 1748 memset_p((void **)ports, NULL, count); 1749 } 1750 1751 static void execlists_dequeue(struct intel_engine_cs *engine) 1752 { 1753 struct intel_engine_execlists * const execlists = &engine->execlists; 1754 struct i915_request **port = execlists->pending; 1755 struct i915_request ** const last_port = port + execlists->port_mask; 1756 struct i915_request *last; 1757 struct rb_node *rb; 1758 bool submit = false; 1759 1760 /* 1761 * Hardware submission is through 2 ports. Conceptually each port 1762 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 1763 * static for a context, and unique to each, so we only execute 1764 * requests belonging to a single context from each ring. RING_HEAD 1765 * is maintained by the CS in the context image, it marks the place 1766 * where it got up to last time, and through RING_TAIL we tell the CS 1767 * where we want to execute up to this time. 1768 * 1769 * In this list the requests are in order of execution. Consecutive 1770 * requests from the same context are adjacent in the ringbuffer. We 1771 * can combine these requests into a single RING_TAIL update: 1772 * 1773 * RING_HEAD...req1...req2 1774 * ^- RING_TAIL 1775 * since to execute req2 the CS must first execute req1. 1776 * 1777 * Our goal then is to point each port to the end of a consecutive 1778 * sequence of requests as being the most optimal (fewest wake ups 1779 * and context switches) submission. 1780 */ 1781 1782 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 1783 struct virtual_engine *ve = 1784 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1785 struct i915_request *rq = READ_ONCE(ve->request); 1786 1787 if (!rq) { /* lazily cleanup after another engine handled rq */ 1788 rb_erase_cached(rb, &execlists->virtual); 1789 RB_CLEAR_NODE(rb); 1790 rb = rb_first_cached(&execlists->virtual); 1791 continue; 1792 } 1793 1794 if (!virtual_matches(ve, rq, engine)) { 1795 rb = rb_next(rb); 1796 continue; 1797 } 1798 1799 break; 1800 } 1801 1802 /* 1803 * If the queue is higher priority than the last 1804 * request in the currently active context, submit afresh. 1805 * We will resubmit again afterwards in case we need to split 1806 * the active context to interject the preemption request, 1807 * i.e. we will retrigger preemption following the ack in case 1808 * of trouble. 1809 */ 1810 last = last_active(execlists); 1811 if (last) { 1812 if (need_preempt(engine, last, rb)) { 1813 ENGINE_TRACE(engine, 1814 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 1815 last->fence.context, 1816 last->fence.seqno, 1817 last->sched.attr.priority, 1818 execlists->queue_priority_hint); 1819 record_preemption(execlists); 1820 1821 /* 1822 * Don't let the RING_HEAD advance past the breadcrumb 1823 * as we unwind (and until we resubmit) so that we do 1824 * not accidentally tell it to go backwards. 1825 */ 1826 ring_set_paused(engine, 1); 1827 1828 /* 1829 * Note that we have not stopped the GPU at this point, 1830 * so we are unwinding the incomplete requests as they 1831 * remain inflight and so by the time we do complete 1832 * the preemption, some of the unwound requests may 1833 * complete! 1834 */ 1835 __unwind_incomplete_requests(engine); 1836 1837 /* 1838 * If we need to return to the preempted context, we 1839 * need to skip the lite-restore and force it to 1840 * reload the RING_TAIL. Otherwise, the HW has a 1841 * tendency to ignore us rewinding the TAIL to the 1842 * end of an earlier request. 1843 */ 1844 last->context->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1845 last = NULL; 1846 } else if (need_timeslice(engine, last) && 1847 timer_expired(&engine->execlists.timer)) { 1848 ENGINE_TRACE(engine, 1849 "expired last=%llx:%lld, prio=%d, hint=%d\n", 1850 last->fence.context, 1851 last->fence.seqno, 1852 last->sched.attr.priority, 1853 execlists->queue_priority_hint); 1854 1855 ring_set_paused(engine, 1); 1856 defer_active(engine); 1857 1858 /* 1859 * Unlike for preemption, if we rewind and continue 1860 * executing the same context as previously active, 1861 * the order of execution will remain the same and 1862 * the tail will only advance. We do not need to 1863 * force a full context restore, as a lite-restore 1864 * is sufficient to resample the monotonic TAIL. 1865 * 1866 * If we switch to any other context, similarly we 1867 * will not rewind TAIL of current context, and 1868 * normal save/restore will preserve state and allow 1869 * us to later continue executing the same request. 1870 */ 1871 last = NULL; 1872 } else { 1873 /* 1874 * Otherwise if we already have a request pending 1875 * for execution after the current one, we can 1876 * just wait until the next CS event before 1877 * queuing more. In either case we will force a 1878 * lite-restore preemption event, but if we wait 1879 * we hopefully coalesce several updates into a single 1880 * submission. 1881 */ 1882 if (!list_is_last(&last->sched.link, 1883 &engine->active.requests)) { 1884 /* 1885 * Even if ELSP[1] is occupied and not worthy 1886 * of timeslices, our queue might be. 1887 */ 1888 if (!execlists->timer.expires && 1889 need_timeslice(engine, last)) 1890 set_timer_ms(&execlists->timer, 1891 timeslice(engine)); 1892 1893 return; 1894 } 1895 } 1896 } 1897 1898 while (rb) { /* XXX virtual is always taking precedence */ 1899 struct virtual_engine *ve = 1900 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1901 struct i915_request *rq; 1902 1903 spin_lock(&ve->base.active.lock); 1904 1905 rq = ve->request; 1906 if (unlikely(!rq)) { /* lost the race to a sibling */ 1907 spin_unlock(&ve->base.active.lock); 1908 rb_erase_cached(rb, &execlists->virtual); 1909 RB_CLEAR_NODE(rb); 1910 rb = rb_first_cached(&execlists->virtual); 1911 continue; 1912 } 1913 1914 GEM_BUG_ON(rq != ve->request); 1915 GEM_BUG_ON(rq->engine != &ve->base); 1916 GEM_BUG_ON(rq->context != &ve->context); 1917 1918 if (rq_prio(rq) >= queue_prio(execlists)) { 1919 if (!virtual_matches(ve, rq, engine)) { 1920 spin_unlock(&ve->base.active.lock); 1921 rb = rb_next(rb); 1922 continue; 1923 } 1924 1925 if (last && !can_merge_rq(last, rq)) { 1926 spin_unlock(&ve->base.active.lock); 1927 return; /* leave this for another */ 1928 } 1929 1930 ENGINE_TRACE(engine, 1931 "virtual rq=%llx:%lld%s, new engine? %s\n", 1932 rq->fence.context, 1933 rq->fence.seqno, 1934 i915_request_completed(rq) ? "!" : 1935 i915_request_started(rq) ? "*" : 1936 "", 1937 yesno(engine != ve->siblings[0])); 1938 1939 ve->request = NULL; 1940 ve->base.execlists.queue_priority_hint = INT_MIN; 1941 rb_erase_cached(rb, &execlists->virtual); 1942 RB_CLEAR_NODE(rb); 1943 1944 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1945 rq->engine = engine; 1946 1947 if (engine != ve->siblings[0]) { 1948 u32 *regs = ve->context.lrc_reg_state; 1949 unsigned int n; 1950 1951 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1952 1953 if (!intel_engine_has_relative_mmio(engine)) 1954 virtual_update_register_offsets(regs, 1955 engine); 1956 1957 if (!list_empty(&ve->context.signals)) 1958 virtual_xfer_breadcrumbs(ve, engine); 1959 1960 /* 1961 * Move the bound engine to the top of the list 1962 * for future execution. We then kick this 1963 * tasklet first before checking others, so that 1964 * we preferentially reuse this set of bound 1965 * registers. 1966 */ 1967 for (n = 1; n < ve->num_siblings; n++) { 1968 if (ve->siblings[n] == engine) { 1969 swap(ve->siblings[n], 1970 ve->siblings[0]); 1971 break; 1972 } 1973 } 1974 1975 GEM_BUG_ON(ve->siblings[0] != engine); 1976 } 1977 1978 if (__i915_request_submit(rq)) { 1979 submit = true; 1980 last = rq; 1981 } 1982 i915_request_put(rq); 1983 1984 /* 1985 * Hmm, we have a bunch of virtual engine requests, 1986 * but the first one was already completed (thanks 1987 * preempt-to-busy!). Keep looking at the veng queue 1988 * until we have no more relevant requests (i.e. 1989 * the normal submit queue has higher priority). 1990 */ 1991 if (!submit) { 1992 spin_unlock(&ve->base.active.lock); 1993 rb = rb_first_cached(&execlists->virtual); 1994 continue; 1995 } 1996 } 1997 1998 spin_unlock(&ve->base.active.lock); 1999 break; 2000 } 2001 2002 while ((rb = rb_first_cached(&execlists->queue))) { 2003 struct i915_priolist *p = to_priolist(rb); 2004 struct i915_request *rq, *rn; 2005 int i; 2006 2007 priolist_for_each_request_consume(rq, rn, p, i) { 2008 bool merge = true; 2009 2010 /* 2011 * Can we combine this request with the current port? 2012 * It has to be the same context/ringbuffer and not 2013 * have any exceptions (e.g. GVT saying never to 2014 * combine contexts). 2015 * 2016 * If we can combine the requests, we can execute both 2017 * by updating the RING_TAIL to point to the end of the 2018 * second request, and so we never need to tell the 2019 * hardware about the first. 2020 */ 2021 if (last && !can_merge_rq(last, rq)) { 2022 /* 2023 * If we are on the second port and cannot 2024 * combine this request with the last, then we 2025 * are done. 2026 */ 2027 if (port == last_port) 2028 goto done; 2029 2030 /* 2031 * We must not populate both ELSP[] with the 2032 * same LRCA, i.e. we must submit 2 different 2033 * contexts if we submit 2 ELSP. 2034 */ 2035 if (last->context == rq->context) 2036 goto done; 2037 2038 if (i915_request_has_sentinel(last)) 2039 goto done; 2040 2041 /* 2042 * If GVT overrides us we only ever submit 2043 * port[0], leaving port[1] empty. Note that we 2044 * also have to be careful that we don't queue 2045 * the same context (even though a different 2046 * request) to the second port. 2047 */ 2048 if (ctx_single_port_submission(last->context) || 2049 ctx_single_port_submission(rq->context)) 2050 goto done; 2051 2052 merge = false; 2053 } 2054 2055 if (__i915_request_submit(rq)) { 2056 if (!merge) { 2057 *port = execlists_schedule_in(last, port - execlists->pending); 2058 port++; 2059 last = NULL; 2060 } 2061 2062 GEM_BUG_ON(last && 2063 !can_merge_ctx(last->context, 2064 rq->context)); 2065 2066 submit = true; 2067 last = rq; 2068 } 2069 } 2070 2071 rb_erase_cached(&p->node, &execlists->queue); 2072 i915_priolist_free(p); 2073 } 2074 2075 done: 2076 /* 2077 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2078 * 2079 * We choose the priority hint such that if we add a request of greater 2080 * priority than this, we kick the submission tasklet to decide on 2081 * the right order of submitting the requests to hardware. We must 2082 * also be prepared to reorder requests as they are in-flight on the 2083 * HW. We derive the priority hint then as the first "hole" in 2084 * the HW submission ports and if there are no available slots, 2085 * the priority of the lowest executing request, i.e. last. 2086 * 2087 * When we do receive a higher priority request ready to run from the 2088 * user, see queue_request(), the priority hint is bumped to that 2089 * request triggering preemption on the next dequeue (or subsequent 2090 * interrupt for secondary ports). 2091 */ 2092 execlists->queue_priority_hint = queue_prio(execlists); 2093 2094 if (submit) { 2095 *port = execlists_schedule_in(last, port - execlists->pending); 2096 execlists->switch_priority_hint = 2097 switch_prio(engine, *execlists->pending); 2098 2099 /* 2100 * Skip if we ended up with exactly the same set of requests, 2101 * e.g. trying to timeslice a pair of ordered contexts 2102 */ 2103 if (!memcmp(execlists->active, execlists->pending, 2104 (port - execlists->pending + 1) * sizeof(*port))) { 2105 do 2106 execlists_schedule_out(fetch_and_zero(port)); 2107 while (port-- != execlists->pending); 2108 2109 goto skip_submit; 2110 } 2111 clear_ports(port + 1, last_port - port); 2112 2113 execlists_submit_ports(engine); 2114 set_preempt_timeout(engine); 2115 } else { 2116 skip_submit: 2117 ring_set_paused(engine, 0); 2118 } 2119 } 2120 2121 static void 2122 cancel_port_requests(struct intel_engine_execlists * const execlists) 2123 { 2124 struct i915_request * const *port; 2125 2126 for (port = execlists->pending; *port; port++) 2127 execlists_schedule_out(*port); 2128 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2129 2130 /* Mark the end of active before we overwrite *active */ 2131 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2132 execlists_schedule_out(*port); 2133 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2134 2135 WRITE_ONCE(execlists->active, execlists->inflight); 2136 } 2137 2138 static inline void 2139 invalidate_csb_entries(const u32 *first, const u32 *last) 2140 { 2141 clflush((void *)first); 2142 clflush((void *)last); 2143 } 2144 2145 static inline bool 2146 reset_in_progress(const struct intel_engine_execlists *execlists) 2147 { 2148 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 2149 } 2150 2151 /* 2152 * Starting with Gen12, the status has a new format: 2153 * 2154 * bit 0: switched to new queue 2155 * bit 1: reserved 2156 * bit 2: semaphore wait mode (poll or signal), only valid when 2157 * switch detail is set to "wait on semaphore" 2158 * bits 3-5: engine class 2159 * bits 6-11: engine instance 2160 * bits 12-14: reserved 2161 * bits 15-25: sw context id of the lrc the GT switched to 2162 * bits 26-31: sw counter of the lrc the GT switched to 2163 * bits 32-35: context switch detail 2164 * - 0: ctx complete 2165 * - 1: wait on sync flip 2166 * - 2: wait on vblank 2167 * - 3: wait on scanline 2168 * - 4: wait on semaphore 2169 * - 5: context preempted (not on SEMAPHORE_WAIT or 2170 * WAIT_FOR_EVENT) 2171 * bit 36: reserved 2172 * bits 37-43: wait detail (for switch detail 1 to 4) 2173 * bits 44-46: reserved 2174 * bits 47-57: sw context id of the lrc the GT switched away from 2175 * bits 58-63: sw counter of the lrc the GT switched away from 2176 */ 2177 static inline bool 2178 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2179 { 2180 u32 lower_dw = csb[0]; 2181 u32 upper_dw = csb[1]; 2182 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2183 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2184 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2185 2186 /* 2187 * The context switch detail is not guaranteed to be 5 when a preemption 2188 * occurs, so we can't just check for that. The check below works for 2189 * all the cases we care about, including preemptions of WAIT 2190 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2191 * would require some extra handling, but we don't support that. 2192 */ 2193 if (!ctx_away_valid || new_queue) { 2194 GEM_BUG_ON(!ctx_to_valid); 2195 return true; 2196 } 2197 2198 /* 2199 * switch detail = 5 is covered by the case above and we do not expect a 2200 * context switch on an unsuccessful wait instruction since we always 2201 * use polling mode. 2202 */ 2203 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2204 return false; 2205 } 2206 2207 static inline bool 2208 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2209 { 2210 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2211 } 2212 2213 static void process_csb(struct intel_engine_cs *engine) 2214 { 2215 struct intel_engine_execlists * const execlists = &engine->execlists; 2216 const u32 * const buf = execlists->csb_status; 2217 const u8 num_entries = execlists->csb_size; 2218 u8 head, tail; 2219 2220 /* 2221 * As we modify our execlists state tracking we require exclusive 2222 * access. Either we are inside the tasklet, or the tasklet is disabled 2223 * and we assume that is only inside the reset paths and so serialised. 2224 */ 2225 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2226 !reset_in_progress(execlists)); 2227 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2228 2229 /* 2230 * Note that csb_write, csb_status may be either in HWSP or mmio. 2231 * When reading from the csb_write mmio register, we have to be 2232 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2233 * the low 4bits. As it happens we know the next 4bits are always 2234 * zero and so we can simply masked off the low u8 of the register 2235 * and treat it identically to reading from the HWSP (without having 2236 * to use explicit shifting and masking, and probably bifurcating 2237 * the code to handle the legacy mmio read). 2238 */ 2239 head = execlists->csb_head; 2240 tail = READ_ONCE(*execlists->csb_write); 2241 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2242 if (unlikely(head == tail)) 2243 return; 2244 2245 /* 2246 * Hopefully paired with a wmb() in HW! 2247 * 2248 * We must complete the read of the write pointer before any reads 2249 * from the CSB, so that we do not see stale values. Without an rmb 2250 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2251 * we perform the READ_ONCE(*csb_write). 2252 */ 2253 rmb(); 2254 2255 do { 2256 bool promote; 2257 2258 if (++head == num_entries) 2259 head = 0; 2260 2261 /* 2262 * We are flying near dragons again. 2263 * 2264 * We hold a reference to the request in execlist_port[] 2265 * but no more than that. We are operating in softirq 2266 * context and so cannot hold any mutex or sleep. That 2267 * prevents us stopping the requests we are processing 2268 * in port[] from being retired simultaneously (the 2269 * breadcrumb will be complete before we see the 2270 * context-switch). As we only hold the reference to the 2271 * request, any pointer chasing underneath the request 2272 * is subject to a potential use-after-free. Thus we 2273 * store all of the bookkeeping within port[] as 2274 * required, and avoid using unguarded pointers beneath 2275 * request itself. The same applies to the atomic 2276 * status notifier. 2277 */ 2278 2279 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2280 head, buf[2 * head + 0], buf[2 * head + 1]); 2281 2282 if (INTEL_GEN(engine->i915) >= 12) 2283 promote = gen12_csb_parse(execlists, buf + 2 * head); 2284 else 2285 promote = gen8_csb_parse(execlists, buf + 2 * head); 2286 if (promote) { 2287 struct i915_request * const *old = execlists->active; 2288 2289 /* Point active to the new ELSP; prevent overwriting */ 2290 WRITE_ONCE(execlists->active, execlists->pending); 2291 2292 if (!inject_preempt_hang(execlists)) 2293 ring_set_paused(engine, 0); 2294 2295 /* cancel old inflight, prepare for switch */ 2296 trace_ports(execlists, "preempted", old); 2297 while (*old) 2298 execlists_schedule_out(*old++); 2299 2300 /* switch pending to inflight */ 2301 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2302 WRITE_ONCE(execlists->active, 2303 memcpy(execlists->inflight, 2304 execlists->pending, 2305 execlists_num_ports(execlists) * 2306 sizeof(*execlists->pending))); 2307 2308 WRITE_ONCE(execlists->pending[0], NULL); 2309 } else { 2310 GEM_BUG_ON(!*execlists->active); 2311 2312 /* port0 completed, advanced to port1 */ 2313 trace_ports(execlists, "completed", execlists->active); 2314 2315 /* 2316 * We rely on the hardware being strongly 2317 * ordered, that the breadcrumb write is 2318 * coherent (visible from the CPU) before the 2319 * user interrupt and CSB is processed. 2320 */ 2321 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 2322 !reset_in_progress(execlists)); 2323 execlists_schedule_out(*execlists->active++); 2324 2325 GEM_BUG_ON(execlists->active - execlists->inflight > 2326 execlists_num_ports(execlists)); 2327 } 2328 } while (head != tail); 2329 2330 execlists->csb_head = head; 2331 set_timeslice(engine); 2332 2333 /* 2334 * Gen11 has proven to fail wrt global observation point between 2335 * entry and tail update, failing on the ordering and thus 2336 * we see an old entry in the context status buffer. 2337 * 2338 * Forcibly evict out entries for the next gpu csb update, 2339 * to increase the odds that we get a fresh entries with non 2340 * working hardware. The cost for doing so comes out mostly with 2341 * the wash as hardware, working or not, will need to do the 2342 * invalidation before. 2343 */ 2344 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2345 } 2346 2347 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2348 { 2349 lockdep_assert_held(&engine->active.lock); 2350 if (!engine->execlists.pending[0]) { 2351 rcu_read_lock(); /* protect peeking at execlists->active */ 2352 execlists_dequeue(engine); 2353 rcu_read_unlock(); 2354 } 2355 } 2356 2357 static void __execlists_hold(struct i915_request *rq) 2358 { 2359 LIST_HEAD(list); 2360 2361 do { 2362 struct i915_dependency *p; 2363 2364 if (i915_request_is_active(rq)) 2365 __i915_request_unsubmit(rq); 2366 2367 RQ_TRACE(rq, "on hold\n"); 2368 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2369 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2370 i915_request_set_hold(rq); 2371 2372 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 2373 struct i915_request *w = 2374 container_of(p->waiter, typeof(*w), sched); 2375 2376 /* Leave semaphores spinning on the other engines */ 2377 if (w->engine != rq->engine) 2378 continue; 2379 2380 if (!i915_request_is_ready(w)) 2381 continue; 2382 2383 if (i915_request_completed(w)) 2384 continue; 2385 2386 if (i915_request_on_hold(rq)) 2387 continue; 2388 2389 list_move_tail(&w->sched.link, &list); 2390 } 2391 2392 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2393 } while (rq); 2394 } 2395 2396 static bool execlists_hold(struct intel_engine_cs *engine, 2397 struct i915_request *rq) 2398 { 2399 spin_lock_irq(&engine->active.lock); 2400 2401 if (i915_request_completed(rq)) { /* too late! */ 2402 rq = NULL; 2403 goto unlock; 2404 } 2405 2406 if (rq->engine != engine) { /* preempted virtual engine */ 2407 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2408 2409 /* 2410 * intel_context_inflight() is only protected by virtue 2411 * of process_csb() being called only by the tasklet (or 2412 * directly from inside reset while the tasklet is suspended). 2413 * Assert that neither of those are allowed to run while we 2414 * poke at the request queues. 2415 */ 2416 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2417 2418 /* 2419 * An unsubmitted request along a virtual engine will 2420 * remain on the active (this) engine until we are able 2421 * to process the context switch away (and so mark the 2422 * context as no longer in flight). That cannot have happened 2423 * yet, otherwise we would not be hanging! 2424 */ 2425 spin_lock(&ve->base.active.lock); 2426 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2427 GEM_BUG_ON(ve->request != rq); 2428 ve->request = NULL; 2429 spin_unlock(&ve->base.active.lock); 2430 i915_request_put(rq); 2431 2432 rq->engine = engine; 2433 } 2434 2435 /* 2436 * Transfer this request onto the hold queue to prevent it 2437 * being resumbitted to HW (and potentially completed) before we have 2438 * released it. Since we may have already submitted following 2439 * requests, we need to remove those as well. 2440 */ 2441 GEM_BUG_ON(i915_request_on_hold(rq)); 2442 GEM_BUG_ON(rq->engine != engine); 2443 __execlists_hold(rq); 2444 2445 unlock: 2446 spin_unlock_irq(&engine->active.lock); 2447 return rq; 2448 } 2449 2450 static bool hold_request(const struct i915_request *rq) 2451 { 2452 struct i915_dependency *p; 2453 2454 /* 2455 * If one of our ancestors is on hold, we must also be on hold, 2456 * otherwise we will bypass it and execute before it. 2457 */ 2458 list_for_each_entry(p, &rq->sched.signalers_list, signal_link) { 2459 const struct i915_request *s = 2460 container_of(p->signaler, typeof(*s), sched); 2461 2462 if (s->engine != rq->engine) 2463 continue; 2464 2465 if (i915_request_on_hold(s)) 2466 return true; 2467 } 2468 2469 return false; 2470 } 2471 2472 static void __execlists_unhold(struct i915_request *rq) 2473 { 2474 LIST_HEAD(list); 2475 2476 do { 2477 struct i915_dependency *p; 2478 2479 GEM_BUG_ON(!i915_request_on_hold(rq)); 2480 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2481 2482 i915_request_clear_hold(rq); 2483 list_move_tail(&rq->sched.link, 2484 i915_sched_lookup_priolist(rq->engine, 2485 rq_prio(rq))); 2486 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2487 RQ_TRACE(rq, "hold release\n"); 2488 2489 /* Also release any children on this engine that are ready */ 2490 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 2491 struct i915_request *w = 2492 container_of(p->waiter, typeof(*w), sched); 2493 2494 if (w->engine != rq->engine) 2495 continue; 2496 2497 if (!i915_request_on_hold(rq)) 2498 continue; 2499 2500 /* Check that no other parents are also on hold */ 2501 if (hold_request(rq)) 2502 continue; 2503 2504 list_move_tail(&w->sched.link, &list); 2505 } 2506 2507 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2508 } while (rq); 2509 } 2510 2511 static void execlists_unhold(struct intel_engine_cs *engine, 2512 struct i915_request *rq) 2513 { 2514 spin_lock_irq(&engine->active.lock); 2515 2516 /* 2517 * Move this request back to the priority queue, and all of its 2518 * children and grandchildren that were suspended along with it. 2519 */ 2520 __execlists_unhold(rq); 2521 2522 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2523 engine->execlists.queue_priority_hint = rq_prio(rq); 2524 tasklet_hi_schedule(&engine->execlists.tasklet); 2525 } 2526 2527 spin_unlock_irq(&engine->active.lock); 2528 } 2529 2530 struct execlists_capture { 2531 struct work_struct work; 2532 struct i915_request *rq; 2533 struct i915_gpu_coredump *error; 2534 }; 2535 2536 static void execlists_capture_work(struct work_struct *work) 2537 { 2538 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2539 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2540 struct intel_engine_cs *engine = cap->rq->engine; 2541 struct intel_gt_coredump *gt = cap->error->gt; 2542 struct intel_engine_capture_vma *vma; 2543 2544 /* Compress all the objects attached to the request, slow! */ 2545 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2546 if (vma) { 2547 struct i915_vma_compress *compress = 2548 i915_vma_capture_prepare(gt); 2549 2550 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2551 i915_vma_capture_finish(gt, compress); 2552 } 2553 2554 gt->simulated = gt->engine->simulated; 2555 cap->error->simulated = gt->simulated; 2556 2557 /* Publish the error state, and announce it to the world */ 2558 i915_error_state_store(cap->error); 2559 i915_gpu_coredump_put(cap->error); 2560 2561 /* Return this request and all that depend upon it for signaling */ 2562 execlists_unhold(engine, cap->rq); 2563 i915_request_put(cap->rq); 2564 2565 kfree(cap); 2566 } 2567 2568 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2569 { 2570 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2571 struct execlists_capture *cap; 2572 2573 cap = kmalloc(sizeof(*cap), gfp); 2574 if (!cap) 2575 return NULL; 2576 2577 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2578 if (!cap->error) 2579 goto err_cap; 2580 2581 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2582 if (!cap->error->gt) 2583 goto err_gpu; 2584 2585 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2586 if (!cap->error->gt->engine) 2587 goto err_gt; 2588 2589 return cap; 2590 2591 err_gt: 2592 kfree(cap->error->gt); 2593 err_gpu: 2594 kfree(cap->error); 2595 err_cap: 2596 kfree(cap); 2597 return NULL; 2598 } 2599 2600 static bool execlists_capture(struct intel_engine_cs *engine) 2601 { 2602 struct execlists_capture *cap; 2603 2604 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 2605 return true; 2606 2607 /* 2608 * We need to _quickly_ capture the engine state before we reset. 2609 * We are inside an atomic section (softirq) here and we are delaying 2610 * the forced preemption event. 2611 */ 2612 cap = capture_regs(engine); 2613 if (!cap) 2614 return true; 2615 2616 cap->rq = execlists_active(&engine->execlists); 2617 GEM_BUG_ON(!cap->rq); 2618 2619 rcu_read_lock(); 2620 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 2621 cap->rq = i915_request_get_rcu(cap->rq); 2622 rcu_read_unlock(); 2623 if (!cap->rq) 2624 goto err_free; 2625 2626 /* 2627 * Remove the request from the execlists queue, and take ownership 2628 * of the request. We pass it to our worker who will _slowly_ compress 2629 * all the pages the _user_ requested for debugging their batch, after 2630 * which we return it to the queue for signaling. 2631 * 2632 * By removing them from the execlists queue, we also remove the 2633 * requests from being processed by __unwind_incomplete_requests() 2634 * during the intel_engine_reset(), and so they will *not* be replayed 2635 * afterwards. 2636 * 2637 * Note that because we have not yet reset the engine at this point, 2638 * it is possible for the request that we have identified as being 2639 * guilty, did in fact complete and we will then hit an arbitration 2640 * point allowing the outstanding preemption to succeed. The likelihood 2641 * of that is very low (as capturing of the engine registers should be 2642 * fast enough to run inside an irq-off atomic section!), so we will 2643 * simply hold that request accountable for being non-preemptible 2644 * long enough to force the reset. 2645 */ 2646 if (!execlists_hold(engine, cap->rq)) 2647 goto err_rq; 2648 2649 INIT_WORK(&cap->work, execlists_capture_work); 2650 schedule_work(&cap->work); 2651 return true; 2652 2653 err_rq: 2654 i915_request_put(cap->rq); 2655 err_free: 2656 i915_gpu_coredump_put(cap->error); 2657 kfree(cap); 2658 return false; 2659 } 2660 2661 static noinline void preempt_reset(struct intel_engine_cs *engine) 2662 { 2663 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2664 unsigned long *lock = &engine->gt->reset.flags; 2665 2666 if (i915_modparams.reset < 3) 2667 return; 2668 2669 if (test_and_set_bit(bit, lock)) 2670 return; 2671 2672 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2673 tasklet_disable_nosync(&engine->execlists.tasklet); 2674 2675 ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n", 2676 READ_ONCE(engine->props.preempt_timeout_ms), 2677 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); 2678 2679 ring_set_paused(engine, 1); /* Freeze the current request in place */ 2680 if (execlists_capture(engine)) 2681 intel_engine_reset(engine, "preemption time out"); 2682 else 2683 ring_set_paused(engine, 0); 2684 2685 tasklet_enable(&engine->execlists.tasklet); 2686 clear_and_wake_up_bit(bit, lock); 2687 } 2688 2689 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2690 { 2691 const struct timer_list *t = &engine->execlists.preempt; 2692 2693 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2694 return false; 2695 2696 if (!timer_expired(t)) 2697 return false; 2698 2699 return READ_ONCE(engine->execlists.pending[0]); 2700 } 2701 2702 /* 2703 * Check the unread Context Status Buffers and manage the submission of new 2704 * contexts to the ELSP accordingly. 2705 */ 2706 static void execlists_submission_tasklet(unsigned long data) 2707 { 2708 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2709 bool timeout = preempt_timeout(engine); 2710 2711 process_csb(engine); 2712 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2713 unsigned long flags; 2714 2715 spin_lock_irqsave(&engine->active.lock, flags); 2716 __execlists_submission_tasklet(engine); 2717 spin_unlock_irqrestore(&engine->active.lock, flags); 2718 2719 /* Recheck after serialising with direct-submission */ 2720 if (timeout && preempt_timeout(engine)) 2721 preempt_reset(engine); 2722 } 2723 } 2724 2725 static void __execlists_kick(struct intel_engine_execlists *execlists) 2726 { 2727 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2728 tasklet_hi_schedule(&execlists->tasklet); 2729 } 2730 2731 #define execlists_kick(t, member) \ 2732 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2733 2734 static void execlists_timeslice(struct timer_list *timer) 2735 { 2736 execlists_kick(timer, timer); 2737 } 2738 2739 static void execlists_preempt(struct timer_list *timer) 2740 { 2741 execlists_kick(timer, preempt); 2742 } 2743 2744 static void queue_request(struct intel_engine_cs *engine, 2745 struct i915_request *rq) 2746 { 2747 GEM_BUG_ON(!list_empty(&rq->sched.link)); 2748 list_add_tail(&rq->sched.link, 2749 i915_sched_lookup_priolist(engine, rq_prio(rq))); 2750 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2751 } 2752 2753 static void __submit_queue_imm(struct intel_engine_cs *engine) 2754 { 2755 struct intel_engine_execlists * const execlists = &engine->execlists; 2756 2757 if (reset_in_progress(execlists)) 2758 return; /* defer until we restart the engine following reset */ 2759 2760 if (execlists->tasklet.func == execlists_submission_tasklet) 2761 __execlists_submission_tasklet(engine); 2762 else 2763 tasklet_hi_schedule(&execlists->tasklet); 2764 } 2765 2766 static void submit_queue(struct intel_engine_cs *engine, 2767 const struct i915_request *rq) 2768 { 2769 struct intel_engine_execlists *execlists = &engine->execlists; 2770 2771 if (rq_prio(rq) <= execlists->queue_priority_hint) 2772 return; 2773 2774 execlists->queue_priority_hint = rq_prio(rq); 2775 __submit_queue_imm(engine); 2776 } 2777 2778 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 2779 const struct i915_request *rq) 2780 { 2781 GEM_BUG_ON(i915_request_on_hold(rq)); 2782 return !list_empty(&engine->active.hold) && hold_request(rq); 2783 } 2784 2785 static void execlists_submit_request(struct i915_request *request) 2786 { 2787 struct intel_engine_cs *engine = request->engine; 2788 unsigned long flags; 2789 2790 /* Will be called from irq-context when using foreign fences. */ 2791 spin_lock_irqsave(&engine->active.lock, flags); 2792 2793 if (unlikely(ancestor_on_hold(engine, request))) { 2794 list_add_tail(&request->sched.link, &engine->active.hold); 2795 i915_request_set_hold(request); 2796 } else { 2797 queue_request(engine, request); 2798 2799 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2800 GEM_BUG_ON(list_empty(&request->sched.link)); 2801 2802 submit_queue(engine, request); 2803 } 2804 2805 spin_unlock_irqrestore(&engine->active.lock, flags); 2806 } 2807 2808 static void __execlists_context_fini(struct intel_context *ce) 2809 { 2810 intel_ring_put(ce->ring); 2811 i915_vma_put(ce->state); 2812 } 2813 2814 static void execlists_context_destroy(struct kref *kref) 2815 { 2816 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2817 2818 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2819 GEM_BUG_ON(intel_context_is_pinned(ce)); 2820 2821 if (ce->state) 2822 __execlists_context_fini(ce); 2823 2824 intel_context_fini(ce); 2825 intel_context_free(ce); 2826 } 2827 2828 static void 2829 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2830 { 2831 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2832 return; 2833 2834 vaddr += engine->context_size; 2835 2836 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 2837 } 2838 2839 static void 2840 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2841 { 2842 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2843 return; 2844 2845 vaddr += engine->context_size; 2846 2847 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 2848 dev_err_once(engine->i915->drm.dev, 2849 "%s context redzone overwritten!\n", 2850 engine->name); 2851 } 2852 2853 static void execlists_context_unpin(struct intel_context *ce) 2854 { 2855 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2856 ce->engine); 2857 2858 i915_gem_object_unpin_map(ce->state->obj); 2859 } 2860 2861 static void 2862 __execlists_update_reg_state(const struct intel_context *ce, 2863 const struct intel_engine_cs *engine) 2864 { 2865 struct intel_ring *ring = ce->ring; 2866 u32 *regs = ce->lrc_reg_state; 2867 2868 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); 2869 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2870 2871 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 2872 regs[CTX_RING_HEAD] = ring->head; 2873 regs[CTX_RING_TAIL] = ring->tail; 2874 2875 /* RPCS */ 2876 if (engine->class == RENDER_CLASS) { 2877 regs[CTX_R_PWR_CLK_STATE] = 2878 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2879 2880 i915_oa_init_reg_state(ce, engine); 2881 } 2882 } 2883 2884 static int 2885 __execlists_context_pin(struct intel_context *ce, 2886 struct intel_engine_cs *engine) 2887 { 2888 void *vaddr; 2889 2890 GEM_BUG_ON(!ce->state); 2891 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2892 2893 vaddr = i915_gem_object_pin_map(ce->state->obj, 2894 i915_coherent_map_type(engine->i915) | 2895 I915_MAP_OVERRIDE); 2896 if (IS_ERR(vaddr)) 2897 return PTR_ERR(vaddr); 2898 2899 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 2900 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2901 __execlists_update_reg_state(ce, engine); 2902 2903 return 0; 2904 } 2905 2906 static int execlists_context_pin(struct intel_context *ce) 2907 { 2908 return __execlists_context_pin(ce, ce->engine); 2909 } 2910 2911 static int execlists_context_alloc(struct intel_context *ce) 2912 { 2913 return __execlists_context_alloc(ce, ce->engine); 2914 } 2915 2916 static void execlists_context_reset(struct intel_context *ce) 2917 { 2918 CE_TRACE(ce, "reset\n"); 2919 GEM_BUG_ON(!intel_context_is_pinned(ce)); 2920 2921 /* 2922 * Because we emit WA_TAIL_DWORDS there may be a disparity 2923 * between our bookkeeping in ce->ring->head and ce->ring->tail and 2924 * that stored in context. As we only write new commands from 2925 * ce->ring->tail onwards, everything before that is junk. If the GPU 2926 * starts reading from its RING_HEAD from the context, it may try to 2927 * execute that junk and die. 2928 * 2929 * The contexts that are stilled pinned on resume belong to the 2930 * kernel, and are local to each engine. All other contexts will 2931 * have their head/tail sanitized upon pinning before use, so they 2932 * will never see garbage, 2933 * 2934 * So to avoid that we reset the context images upon resume. For 2935 * simplicity, we just zero everything out. 2936 */ 2937 intel_ring_reset(ce->ring, ce->ring->emit); 2938 2939 /* Scrub away the garbage */ 2940 execlists_init_reg_state(ce->lrc_reg_state, 2941 ce, ce->engine, ce->ring, true); 2942 __execlists_update_reg_state(ce, ce->engine); 2943 2944 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 2945 } 2946 2947 static const struct intel_context_ops execlists_context_ops = { 2948 .alloc = execlists_context_alloc, 2949 2950 .pin = execlists_context_pin, 2951 .unpin = execlists_context_unpin, 2952 2953 .enter = intel_context_enter_engine, 2954 .exit = intel_context_exit_engine, 2955 2956 .reset = execlists_context_reset, 2957 .destroy = execlists_context_destroy, 2958 }; 2959 2960 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 2961 { 2962 u32 *cs; 2963 2964 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb); 2965 2966 cs = intel_ring_begin(rq, 6); 2967 if (IS_ERR(cs)) 2968 return PTR_ERR(cs); 2969 2970 /* 2971 * Check if we have been preempted before we even get started. 2972 * 2973 * After this point i915_request_started() reports true, even if 2974 * we get preempted and so are no longer running. 2975 */ 2976 *cs++ = MI_ARB_CHECK; 2977 *cs++ = MI_NOOP; 2978 2979 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 2980 *cs++ = i915_request_timeline(rq)->hwsp_offset; 2981 *cs++ = 0; 2982 *cs++ = rq->fence.seqno - 1; 2983 2984 intel_ring_advance(rq, cs); 2985 2986 /* Record the updated position of the request's payload */ 2987 rq->infix = intel_ring_offset(rq, cs); 2988 2989 return 0; 2990 } 2991 2992 static int execlists_request_alloc(struct i915_request *request) 2993 { 2994 int ret; 2995 2996 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 2997 2998 /* 2999 * Flush enough space to reduce the likelihood of waiting after 3000 * we start building the request - in which case we will just 3001 * have to repeat work. 3002 */ 3003 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3004 3005 /* 3006 * Note that after this point, we have committed to using 3007 * this request as it is being used to both track the 3008 * state of engine initialisation and liveness of the 3009 * golden renderstate above. Think twice before you try 3010 * to cancel/unwind this request now. 3011 */ 3012 3013 /* Unconditionally invalidate GPU caches and TLBs. */ 3014 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3015 if (ret) 3016 return ret; 3017 3018 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3019 return 0; 3020 } 3021 3022 /* 3023 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 3024 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3025 * but there is a slight complication as this is applied in WA batch where the 3026 * values are only initialized once so we cannot take register value at the 3027 * beginning and reuse it further; hence we save its value to memory, upload a 3028 * constant value with bit21 set and then we restore it back with the saved value. 3029 * To simplify the WA, a constant value is formed by using the default value 3030 * of this register. This shouldn't be a problem because we are only modifying 3031 * it for a short period and this batch in non-premptible. We can ofcourse 3032 * use additional instructions that read the actual value of the register 3033 * at that time and set our bit of interest but it makes the WA complicated. 3034 * 3035 * This WA is also required for Gen9 so extracting as a function avoids 3036 * code duplication. 3037 */ 3038 static u32 * 3039 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3040 { 3041 /* NB no one else is allowed to scribble over scratch + 256! */ 3042 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3043 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3044 *batch++ = intel_gt_scratch_offset(engine->gt, 3045 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3046 *batch++ = 0; 3047 3048 *batch++ = MI_LOAD_REGISTER_IMM(1); 3049 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3050 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3051 3052 batch = gen8_emit_pipe_control(batch, 3053 PIPE_CONTROL_CS_STALL | 3054 PIPE_CONTROL_DC_FLUSH_ENABLE, 3055 0); 3056 3057 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3058 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3059 *batch++ = intel_gt_scratch_offset(engine->gt, 3060 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3061 *batch++ = 0; 3062 3063 return batch; 3064 } 3065 3066 /* 3067 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3068 * initialized at the beginning and shared across all contexts but this field 3069 * helps us to have multiple batches at different offsets and select them based 3070 * on a criteria. At the moment this batch always start at the beginning of the page 3071 * and at this point we don't have multiple wa_ctx batch buffers. 3072 * 3073 * The number of WA applied are not known at the beginning; we use this field 3074 * to return the no of DWORDS written. 3075 * 3076 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3077 * so it adds NOOPs as padding to make it cacheline aligned. 3078 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3079 * makes a complete batch buffer. 3080 */ 3081 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3082 { 3083 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3084 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3085 3086 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3087 if (IS_BROADWELL(engine->i915)) 3088 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3089 3090 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3091 /* Actual scratch location is at 128 bytes offset */ 3092 batch = gen8_emit_pipe_control(batch, 3093 PIPE_CONTROL_FLUSH_L3 | 3094 PIPE_CONTROL_STORE_DATA_INDEX | 3095 PIPE_CONTROL_CS_STALL | 3096 PIPE_CONTROL_QW_WRITE, 3097 LRC_PPHWSP_SCRATCH_ADDR); 3098 3099 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3100 3101 /* Pad to end of cacheline */ 3102 while ((unsigned long)batch % CACHELINE_BYTES) 3103 *batch++ = MI_NOOP; 3104 3105 /* 3106 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3107 * execution depends on the length specified in terms of cache lines 3108 * in the register CTX_RCS_INDIRECT_CTX 3109 */ 3110 3111 return batch; 3112 } 3113 3114 struct lri { 3115 i915_reg_t reg; 3116 u32 value; 3117 }; 3118 3119 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3120 { 3121 GEM_BUG_ON(!count || count > 63); 3122 3123 *batch++ = MI_LOAD_REGISTER_IMM(count); 3124 do { 3125 *batch++ = i915_mmio_reg_offset(lri->reg); 3126 *batch++ = lri->value; 3127 } while (lri++, --count); 3128 *batch++ = MI_NOOP; 3129 3130 return batch; 3131 } 3132 3133 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3134 { 3135 static const struct lri lri[] = { 3136 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3137 { 3138 COMMON_SLICE_CHICKEN2, 3139 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3140 0), 3141 }, 3142 3143 /* BSpec: 11391 */ 3144 { 3145 FF_SLICE_CHICKEN, 3146 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3147 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3148 }, 3149 3150 /* BSpec: 11299 */ 3151 { 3152 _3D_CHICKEN3, 3153 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3154 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3155 } 3156 }; 3157 3158 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3159 3160 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3161 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3162 3163 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3164 batch = gen8_emit_pipe_control(batch, 3165 PIPE_CONTROL_FLUSH_L3 | 3166 PIPE_CONTROL_STORE_DATA_INDEX | 3167 PIPE_CONTROL_CS_STALL | 3168 PIPE_CONTROL_QW_WRITE, 3169 LRC_PPHWSP_SCRATCH_ADDR); 3170 3171 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3172 3173 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3174 if (HAS_POOLED_EU(engine->i915)) { 3175 /* 3176 * EU pool configuration is setup along with golden context 3177 * during context initialization. This value depends on 3178 * device type (2x6 or 3x6) and needs to be updated based 3179 * on which subslice is disabled especially for 2x6 3180 * devices, however it is safe to load default 3181 * configuration of 3x6 device instead of masking off 3182 * corresponding bits because HW ignores bits of a disabled 3183 * subslice and drops down to appropriate config. Please 3184 * see render_state_setup() in i915_gem_render_state.c for 3185 * possible configurations, to avoid duplication they are 3186 * not shown here again. 3187 */ 3188 *batch++ = GEN9_MEDIA_POOL_STATE; 3189 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3190 *batch++ = 0x00777000; 3191 *batch++ = 0; 3192 *batch++ = 0; 3193 *batch++ = 0; 3194 } 3195 3196 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3197 3198 /* Pad to end of cacheline */ 3199 while ((unsigned long)batch % CACHELINE_BYTES) 3200 *batch++ = MI_NOOP; 3201 3202 return batch; 3203 } 3204 3205 static u32 * 3206 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3207 { 3208 int i; 3209 3210 /* 3211 * WaPipeControlBefore3DStateSamplePattern: cnl 3212 * 3213 * Ensure the engine is idle prior to programming a 3214 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3215 */ 3216 batch = gen8_emit_pipe_control(batch, 3217 PIPE_CONTROL_CS_STALL, 3218 0); 3219 /* 3220 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3221 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3222 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3223 * confusing. Since gen8_emit_pipe_control() already advances the 3224 * batch by 6 dwords, we advance the other 10 here, completing a 3225 * cacheline. It's not clear if the workaround requires this padding 3226 * before other commands, or if it's just the regular padding we would 3227 * already have for the workaround bb, so leave it here for now. 3228 */ 3229 for (i = 0; i < 10; i++) 3230 *batch++ = MI_NOOP; 3231 3232 /* Pad to end of cacheline */ 3233 while ((unsigned long)batch % CACHELINE_BYTES) 3234 *batch++ = MI_NOOP; 3235 3236 return batch; 3237 } 3238 3239 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3240 3241 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3242 { 3243 struct drm_i915_gem_object *obj; 3244 struct i915_vma *vma; 3245 int err; 3246 3247 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3248 if (IS_ERR(obj)) 3249 return PTR_ERR(obj); 3250 3251 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3252 if (IS_ERR(vma)) { 3253 err = PTR_ERR(vma); 3254 goto err; 3255 } 3256 3257 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 3258 if (err) 3259 goto err; 3260 3261 engine->wa_ctx.vma = vma; 3262 return 0; 3263 3264 err: 3265 i915_gem_object_put(obj); 3266 return err; 3267 } 3268 3269 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3270 { 3271 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3272 } 3273 3274 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3275 3276 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3277 { 3278 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3279 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3280 &wa_ctx->per_ctx }; 3281 wa_bb_func_t wa_bb_fn[2]; 3282 struct page *page; 3283 void *batch, *batch_ptr; 3284 unsigned int i; 3285 int ret; 3286 3287 if (engine->class != RENDER_CLASS) 3288 return 0; 3289 3290 switch (INTEL_GEN(engine->i915)) { 3291 case 12: 3292 case 11: 3293 return 0; 3294 case 10: 3295 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3296 wa_bb_fn[1] = NULL; 3297 break; 3298 case 9: 3299 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3300 wa_bb_fn[1] = NULL; 3301 break; 3302 case 8: 3303 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3304 wa_bb_fn[1] = NULL; 3305 break; 3306 default: 3307 MISSING_CASE(INTEL_GEN(engine->i915)); 3308 return 0; 3309 } 3310 3311 ret = lrc_setup_wa_ctx(engine); 3312 if (ret) { 3313 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 3314 return ret; 3315 } 3316 3317 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 3318 batch = batch_ptr = kmap_atomic(page); 3319 3320 /* 3321 * Emit the two workaround batch buffers, recording the offset from the 3322 * start of the workaround batch buffer object for each and their 3323 * respective sizes. 3324 */ 3325 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3326 wa_bb[i]->offset = batch_ptr - batch; 3327 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3328 CACHELINE_BYTES))) { 3329 ret = -EINVAL; 3330 break; 3331 } 3332 if (wa_bb_fn[i]) 3333 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3334 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3335 } 3336 3337 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3338 3339 kunmap_atomic(batch); 3340 if (ret) 3341 lrc_destroy_wa_ctx(engine); 3342 3343 return ret; 3344 } 3345 3346 static void enable_execlists(struct intel_engine_cs *engine) 3347 { 3348 u32 mode; 3349 3350 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 3351 3352 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 3353 3354 if (INTEL_GEN(engine->i915) >= 11) 3355 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 3356 else 3357 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 3358 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 3359 3360 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 3361 3362 ENGINE_WRITE_FW(engine, 3363 RING_HWS_PGA, 3364 i915_ggtt_offset(engine->status_page.vma)); 3365 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 3366 3367 engine->context_tag = 0; 3368 } 3369 3370 static bool unexpected_starting_state(struct intel_engine_cs *engine) 3371 { 3372 bool unexpected = false; 3373 3374 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 3375 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 3376 unexpected = true; 3377 } 3378 3379 return unexpected; 3380 } 3381 3382 static int execlists_resume(struct intel_engine_cs *engine) 3383 { 3384 intel_engine_apply_workarounds(engine); 3385 intel_engine_apply_whitelist(engine); 3386 3387 intel_mocs_init_engine(engine); 3388 3389 intel_engine_reset_breadcrumbs(engine); 3390 3391 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 3392 struct drm_printer p = drm_debug_printer(__func__); 3393 3394 intel_engine_dump(engine, &p, NULL); 3395 } 3396 3397 enable_execlists(engine); 3398 3399 return 0; 3400 } 3401 3402 static void execlists_reset_prepare(struct intel_engine_cs *engine) 3403 { 3404 struct intel_engine_execlists * const execlists = &engine->execlists; 3405 unsigned long flags; 3406 3407 ENGINE_TRACE(engine, "depth<-%d\n", 3408 atomic_read(&execlists->tasklet.count)); 3409 3410 /* 3411 * Prevent request submission to the hardware until we have 3412 * completed the reset in i915_gem_reset_finish(). If a request 3413 * is completed by one engine, it may then queue a request 3414 * to a second via its execlists->tasklet *just* as we are 3415 * calling engine->resume() and also writing the ELSP. 3416 * Turning off the execlists->tasklet until the reset is over 3417 * prevents the race. 3418 */ 3419 __tasklet_disable_sync_once(&execlists->tasklet); 3420 GEM_BUG_ON(!reset_in_progress(execlists)); 3421 3422 /* And flush any current direct submission. */ 3423 spin_lock_irqsave(&engine->active.lock, flags); 3424 spin_unlock_irqrestore(&engine->active.lock, flags); 3425 3426 /* 3427 * We stop engines, otherwise we might get failed reset and a 3428 * dead gpu (on elk). Also as modern gpu as kbl can suffer 3429 * from system hang if batchbuffer is progressing when 3430 * the reset is issued, regardless of READY_TO_RESET ack. 3431 * Thus assume it is best to stop engines on all gens 3432 * where we have a gpu reset. 3433 * 3434 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 3435 * 3436 * FIXME: Wa for more modern gens needs to be validated 3437 */ 3438 intel_engine_stop_cs(engine); 3439 } 3440 3441 static void reset_csb_pointers(struct intel_engine_cs *engine) 3442 { 3443 struct intel_engine_execlists * const execlists = &engine->execlists; 3444 const unsigned int reset_value = execlists->csb_size - 1; 3445 3446 ring_set_paused(engine, 0); 3447 3448 /* 3449 * After a reset, the HW starts writing into CSB entry [0]. We 3450 * therefore have to set our HEAD pointer back one entry so that 3451 * the *first* entry we check is entry 0. To complicate this further, 3452 * as we don't wait for the first interrupt after reset, we have to 3453 * fake the HW write to point back to the last entry so that our 3454 * inline comparison of our cached head position against the last HW 3455 * write works even before the first interrupt. 3456 */ 3457 execlists->csb_head = reset_value; 3458 WRITE_ONCE(*execlists->csb_write, reset_value); 3459 wmb(); /* Make sure this is visible to HW (paranoia?) */ 3460 3461 /* 3462 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3463 * Bludgeon them with a mmio update to be sure. 3464 */ 3465 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3466 reset_value << 8 | reset_value); 3467 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3468 3469 invalidate_csb_entries(&execlists->csb_status[0], 3470 &execlists->csb_status[reset_value]); 3471 } 3472 3473 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 3474 { 3475 int x; 3476 3477 x = lrc_ring_mi_mode(engine); 3478 if (x != -1) { 3479 regs[x + 1] &= ~STOP_RING; 3480 regs[x + 1] |= STOP_RING << 16; 3481 } 3482 } 3483 3484 static void __execlists_reset_reg_state(const struct intel_context *ce, 3485 const struct intel_engine_cs *engine) 3486 { 3487 u32 *regs = ce->lrc_reg_state; 3488 3489 __reset_stop_ring(regs, engine); 3490 } 3491 3492 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 3493 { 3494 struct intel_engine_execlists * const execlists = &engine->execlists; 3495 struct intel_context *ce; 3496 struct i915_request *rq; 3497 3498 mb(); /* paranoia: read the CSB pointers from after the reset */ 3499 clflush(execlists->csb_write); 3500 mb(); 3501 3502 process_csb(engine); /* drain preemption events */ 3503 3504 /* Following the reset, we need to reload the CSB read/write pointers */ 3505 reset_csb_pointers(engine); 3506 3507 /* 3508 * Save the currently executing context, even if we completed 3509 * its request, it was still running at the time of the 3510 * reset and will have been clobbered. 3511 */ 3512 rq = execlists_active(execlists); 3513 if (!rq) 3514 goto unwind; 3515 3516 /* We still have requests in-flight; the engine should be active */ 3517 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3518 3519 ce = rq->context; 3520 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3521 3522 if (i915_request_completed(rq)) { 3523 /* Idle context; tidy up the ring so we can restart afresh */ 3524 ce->ring->head = intel_ring_wrap(ce->ring, rq->tail); 3525 goto out_replay; 3526 } 3527 3528 /* Context has requests still in-flight; it should not be idle! */ 3529 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3530 rq = active_request(ce->timeline, rq); 3531 ce->ring->head = intel_ring_wrap(ce->ring, rq->head); 3532 GEM_BUG_ON(ce->ring->head == ce->ring->tail); 3533 3534 /* 3535 * If this request hasn't started yet, e.g. it is waiting on a 3536 * semaphore, we need to avoid skipping the request or else we 3537 * break the signaling chain. However, if the context is corrupt 3538 * the request will not restart and we will be stuck with a wedged 3539 * device. It is quite often the case that if we issue a reset 3540 * while the GPU is loading the context image, that the context 3541 * image becomes corrupt. 3542 * 3543 * Otherwise, if we have not started yet, the request should replay 3544 * perfectly and we do not need to flag the result as being erroneous. 3545 */ 3546 if (!i915_request_started(rq)) 3547 goto out_replay; 3548 3549 /* 3550 * If the request was innocent, we leave the request in the ELSP 3551 * and will try to replay it on restarting. The context image may 3552 * have been corrupted by the reset, in which case we may have 3553 * to service a new GPU hang, but more likely we can continue on 3554 * without impact. 3555 * 3556 * If the request was guilty, we presume the context is corrupt 3557 * and have to at least restore the RING register in the context 3558 * image back to the expected values to skip over the guilty request. 3559 */ 3560 __i915_request_reset(rq, stalled); 3561 if (!stalled) 3562 goto out_replay; 3563 3564 /* 3565 * We want a simple context + ring to execute the breadcrumb update. 3566 * We cannot rely on the context being intact across the GPU hang, 3567 * so clear it and rebuild just what we need for the breadcrumb. 3568 * All pending requests for this context will be zapped, and any 3569 * future request will be after userspace has had the opportunity 3570 * to recreate its own state. 3571 */ 3572 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3573 restore_default_state(ce, engine); 3574 3575 out_replay: 3576 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 3577 ce->ring->head, ce->ring->tail); 3578 intel_ring_update_space(ce->ring); 3579 __execlists_reset_reg_state(ce, engine); 3580 __execlists_update_reg_state(ce, engine); 3581 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3582 3583 unwind: 3584 /* Push back any incomplete requests for replay after the reset. */ 3585 cancel_port_requests(execlists); 3586 __unwind_incomplete_requests(engine); 3587 } 3588 3589 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 3590 { 3591 unsigned long flags; 3592 3593 ENGINE_TRACE(engine, "\n"); 3594 3595 spin_lock_irqsave(&engine->active.lock, flags); 3596 3597 __execlists_reset(engine, stalled); 3598 3599 spin_unlock_irqrestore(&engine->active.lock, flags); 3600 } 3601 3602 static void nop_submission_tasklet(unsigned long data) 3603 { 3604 /* The driver is wedged; don't process any more events. */ 3605 } 3606 3607 static void execlists_reset_cancel(struct intel_engine_cs *engine) 3608 { 3609 struct intel_engine_execlists * const execlists = &engine->execlists; 3610 struct i915_request *rq, *rn; 3611 struct rb_node *rb; 3612 unsigned long flags; 3613 3614 ENGINE_TRACE(engine, "\n"); 3615 3616 /* 3617 * Before we call engine->cancel_requests(), we should have exclusive 3618 * access to the submission state. This is arranged for us by the 3619 * caller disabling the interrupt generation, the tasklet and other 3620 * threads that may then access the same state, giving us a free hand 3621 * to reset state. However, we still need to let lockdep be aware that 3622 * we know this state may be accessed in hardirq context, so we 3623 * disable the irq around this manipulation and we want to keep 3624 * the spinlock focused on its duties and not accidentally conflate 3625 * coverage to the submission's irq state. (Similarly, although we 3626 * shouldn't need to disable irq around the manipulation of the 3627 * submission's irq state, we also wish to remind ourselves that 3628 * it is irq state.) 3629 */ 3630 spin_lock_irqsave(&engine->active.lock, flags); 3631 3632 __execlists_reset(engine, true); 3633 3634 /* Mark all executing requests as skipped. */ 3635 list_for_each_entry(rq, &engine->active.requests, sched.link) 3636 mark_eio(rq); 3637 3638 /* Flush the queued requests to the timeline list (for retiring). */ 3639 while ((rb = rb_first_cached(&execlists->queue))) { 3640 struct i915_priolist *p = to_priolist(rb); 3641 int i; 3642 3643 priolist_for_each_request_consume(rq, rn, p, i) { 3644 mark_eio(rq); 3645 __i915_request_submit(rq); 3646 } 3647 3648 rb_erase_cached(&p->node, &execlists->queue); 3649 i915_priolist_free(p); 3650 } 3651 3652 /* On-hold requests will be flushed to timeline upon their release */ 3653 list_for_each_entry(rq, &engine->active.hold, sched.link) 3654 mark_eio(rq); 3655 3656 /* Cancel all attached virtual engines */ 3657 while ((rb = rb_first_cached(&execlists->virtual))) { 3658 struct virtual_engine *ve = 3659 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3660 3661 rb_erase_cached(rb, &execlists->virtual); 3662 RB_CLEAR_NODE(rb); 3663 3664 spin_lock(&ve->base.active.lock); 3665 rq = fetch_and_zero(&ve->request); 3666 if (rq) { 3667 mark_eio(rq); 3668 3669 rq->engine = engine; 3670 __i915_request_submit(rq); 3671 i915_request_put(rq); 3672 3673 ve->base.execlists.queue_priority_hint = INT_MIN; 3674 } 3675 spin_unlock(&ve->base.active.lock); 3676 } 3677 3678 /* Remaining _unready_ requests will be nop'ed when submitted */ 3679 3680 execlists->queue_priority_hint = INT_MIN; 3681 execlists->queue = RB_ROOT_CACHED; 3682 3683 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3684 execlists->tasklet.func = nop_submission_tasklet; 3685 3686 spin_unlock_irqrestore(&engine->active.lock, flags); 3687 } 3688 3689 static void execlists_reset_finish(struct intel_engine_cs *engine) 3690 { 3691 struct intel_engine_execlists * const execlists = &engine->execlists; 3692 3693 /* 3694 * After a GPU reset, we may have requests to replay. Do so now while 3695 * we still have the forcewake to be sure that the GPU is not allowed 3696 * to sleep before we restart and reload a context. 3697 */ 3698 GEM_BUG_ON(!reset_in_progress(execlists)); 3699 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3700 execlists->tasklet.func(execlists->tasklet.data); 3701 3702 if (__tasklet_enable(&execlists->tasklet)) 3703 /* And kick in case we missed a new request submission. */ 3704 tasklet_hi_schedule(&execlists->tasklet); 3705 ENGINE_TRACE(engine, "depth->%d\n", 3706 atomic_read(&execlists->tasklet.count)); 3707 } 3708 3709 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 3710 u64 offset, u32 len, 3711 const unsigned int flags) 3712 { 3713 u32 *cs; 3714 3715 cs = intel_ring_begin(rq, 4); 3716 if (IS_ERR(cs)) 3717 return PTR_ERR(cs); 3718 3719 /* 3720 * WaDisableCtxRestoreArbitration:bdw,chv 3721 * 3722 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3723 * particular all the gen that do not need the w/a at all!), if we 3724 * took care to make sure that on every switch into this context 3725 * (both ordinary and for preemption) that arbitrartion was enabled 3726 * we would be fine. However, for gen8 there is another w/a that 3727 * requires us to not preempt inside GPGPU execution, so we keep 3728 * arbitration disabled for gen8 batches. Arbitration will be 3729 * re-enabled before we close the request 3730 * (engine->emit_fini_breadcrumb). 3731 */ 3732 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3733 3734 /* FIXME(BDW+): Address space and security selectors. */ 3735 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3736 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3737 *cs++ = lower_32_bits(offset); 3738 *cs++ = upper_32_bits(offset); 3739 3740 intel_ring_advance(rq, cs); 3741 3742 return 0; 3743 } 3744 3745 static int gen8_emit_bb_start(struct i915_request *rq, 3746 u64 offset, u32 len, 3747 const unsigned int flags) 3748 { 3749 u32 *cs; 3750 3751 cs = intel_ring_begin(rq, 6); 3752 if (IS_ERR(cs)) 3753 return PTR_ERR(cs); 3754 3755 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3756 3757 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3758 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3759 *cs++ = lower_32_bits(offset); 3760 *cs++ = upper_32_bits(offset); 3761 3762 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3763 *cs++ = MI_NOOP; 3764 3765 intel_ring_advance(rq, cs); 3766 3767 return 0; 3768 } 3769 3770 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3771 { 3772 ENGINE_WRITE(engine, RING_IMR, 3773 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3774 ENGINE_POSTING_READ(engine, RING_IMR); 3775 } 3776 3777 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3778 { 3779 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3780 } 3781 3782 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3783 { 3784 u32 cmd, *cs; 3785 3786 cs = intel_ring_begin(request, 4); 3787 if (IS_ERR(cs)) 3788 return PTR_ERR(cs); 3789 3790 cmd = MI_FLUSH_DW + 1; 3791 3792 /* We always require a command barrier so that subsequent 3793 * commands, such as breadcrumb interrupts, are strictly ordered 3794 * wrt the contents of the write cache being flushed to memory 3795 * (and thus being coherent from the CPU). 3796 */ 3797 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3798 3799 if (mode & EMIT_INVALIDATE) { 3800 cmd |= MI_INVALIDATE_TLB; 3801 if (request->engine->class == VIDEO_DECODE_CLASS) 3802 cmd |= MI_INVALIDATE_BSD; 3803 } 3804 3805 *cs++ = cmd; 3806 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3807 *cs++ = 0; /* upper addr */ 3808 *cs++ = 0; /* value */ 3809 intel_ring_advance(request, cs); 3810 3811 return 0; 3812 } 3813 3814 static int gen8_emit_flush_render(struct i915_request *request, 3815 u32 mode) 3816 { 3817 bool vf_flush_wa = false, dc_flush_wa = false; 3818 u32 *cs, flags = 0; 3819 int len; 3820 3821 flags |= PIPE_CONTROL_CS_STALL; 3822 3823 if (mode & EMIT_FLUSH) { 3824 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3825 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3826 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3827 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3828 } 3829 3830 if (mode & EMIT_INVALIDATE) { 3831 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3832 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3833 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3834 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3835 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3836 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3837 flags |= PIPE_CONTROL_QW_WRITE; 3838 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3839 3840 /* 3841 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3842 * pipe control. 3843 */ 3844 if (IS_GEN(request->i915, 9)) 3845 vf_flush_wa = true; 3846 3847 /* WaForGAMHang:kbl */ 3848 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3849 dc_flush_wa = true; 3850 } 3851 3852 len = 6; 3853 3854 if (vf_flush_wa) 3855 len += 6; 3856 3857 if (dc_flush_wa) 3858 len += 12; 3859 3860 cs = intel_ring_begin(request, len); 3861 if (IS_ERR(cs)) 3862 return PTR_ERR(cs); 3863 3864 if (vf_flush_wa) 3865 cs = gen8_emit_pipe_control(cs, 0, 0); 3866 3867 if (dc_flush_wa) 3868 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3869 0); 3870 3871 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3872 3873 if (dc_flush_wa) 3874 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3875 3876 intel_ring_advance(request, cs); 3877 3878 return 0; 3879 } 3880 3881 static int gen11_emit_flush_render(struct i915_request *request, 3882 u32 mode) 3883 { 3884 if (mode & EMIT_FLUSH) { 3885 u32 *cs; 3886 u32 flags = 0; 3887 3888 flags |= PIPE_CONTROL_CS_STALL; 3889 3890 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3891 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3892 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3893 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3894 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3895 flags |= PIPE_CONTROL_QW_WRITE; 3896 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3897 3898 cs = intel_ring_begin(request, 6); 3899 if (IS_ERR(cs)) 3900 return PTR_ERR(cs); 3901 3902 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3903 intel_ring_advance(request, cs); 3904 } 3905 3906 if (mode & EMIT_INVALIDATE) { 3907 u32 *cs; 3908 u32 flags = 0; 3909 3910 flags |= PIPE_CONTROL_CS_STALL; 3911 3912 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3913 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3914 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3915 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3916 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3917 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3918 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3919 flags |= PIPE_CONTROL_QW_WRITE; 3920 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3921 3922 cs = intel_ring_begin(request, 6); 3923 if (IS_ERR(cs)) 3924 return PTR_ERR(cs); 3925 3926 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3927 intel_ring_advance(request, cs); 3928 } 3929 3930 return 0; 3931 } 3932 3933 static u32 preparser_disable(bool state) 3934 { 3935 return MI_ARB_CHECK | 1 << 8 | state; 3936 } 3937 3938 static int gen12_emit_flush_render(struct i915_request *request, 3939 u32 mode) 3940 { 3941 if (mode & EMIT_FLUSH) { 3942 u32 flags = 0; 3943 u32 *cs; 3944 3945 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3946 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3947 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3948 /* Wa_1409600907:tgl */ 3949 flags |= PIPE_CONTROL_DEPTH_STALL; 3950 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3951 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3952 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3953 3954 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3955 flags |= PIPE_CONTROL_QW_WRITE; 3956 3957 flags |= PIPE_CONTROL_CS_STALL; 3958 3959 cs = intel_ring_begin(request, 6); 3960 if (IS_ERR(cs)) 3961 return PTR_ERR(cs); 3962 3963 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3964 intel_ring_advance(request, cs); 3965 } 3966 3967 if (mode & EMIT_INVALIDATE) { 3968 u32 flags = 0; 3969 u32 *cs; 3970 3971 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3972 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3973 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3974 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3975 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3976 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3977 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3978 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 3979 3980 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3981 flags |= PIPE_CONTROL_QW_WRITE; 3982 3983 flags |= PIPE_CONTROL_CS_STALL; 3984 3985 cs = intel_ring_begin(request, 8); 3986 if (IS_ERR(cs)) 3987 return PTR_ERR(cs); 3988 3989 /* 3990 * Prevent the pre-parser from skipping past the TLB 3991 * invalidate and loading a stale page for the batch 3992 * buffer / request payload. 3993 */ 3994 *cs++ = preparser_disable(true); 3995 3996 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3997 3998 *cs++ = preparser_disable(false); 3999 intel_ring_advance(request, cs); 4000 4001 /* 4002 * Wa_1604544889:tgl 4003 */ 4004 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 4005 flags = 0; 4006 flags |= PIPE_CONTROL_CS_STALL; 4007 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 4008 4009 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4010 flags |= PIPE_CONTROL_QW_WRITE; 4011 4012 cs = intel_ring_begin(request, 6); 4013 if (IS_ERR(cs)) 4014 return PTR_ERR(cs); 4015 4016 cs = gen8_emit_pipe_control(cs, flags, 4017 LRC_PPHWSP_SCRATCH_ADDR); 4018 intel_ring_advance(request, cs); 4019 } 4020 } 4021 4022 return 0; 4023 } 4024 4025 /* 4026 * Reserve space for 2 NOOPs at the end of each request to be 4027 * used as a workaround for not being allowed to do lite 4028 * restore with HEAD==TAIL (WaIdleLiteRestore). 4029 */ 4030 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4031 { 4032 /* Ensure there's always at least one preemption point per-request. */ 4033 *cs++ = MI_ARB_CHECK; 4034 *cs++ = MI_NOOP; 4035 request->wa_tail = intel_ring_offset(request, cs); 4036 4037 return cs; 4038 } 4039 4040 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4041 { 4042 *cs++ = MI_SEMAPHORE_WAIT | 4043 MI_SEMAPHORE_GLOBAL_GTT | 4044 MI_SEMAPHORE_POLL | 4045 MI_SEMAPHORE_SAD_EQ_SDD; 4046 *cs++ = 0; 4047 *cs++ = intel_hws_preempt_address(request->engine); 4048 *cs++ = 0; 4049 4050 return cs; 4051 } 4052 4053 static __always_inline u32* 4054 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 4055 u32 *cs) 4056 { 4057 *cs++ = MI_USER_INTERRUPT; 4058 4059 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4060 if (intel_engine_has_semaphores(request->engine)) 4061 cs = emit_preempt_busywait(request, cs); 4062 4063 request->tail = intel_ring_offset(request, cs); 4064 assert_ring_tail_valid(request->ring, request->tail); 4065 4066 return gen8_emit_wa_tail(request, cs); 4067 } 4068 4069 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4070 { 4071 cs = gen8_emit_ggtt_write(cs, 4072 request->fence.seqno, 4073 i915_request_active_timeline(request)->hwsp_offset, 4074 0); 4075 4076 return gen8_emit_fini_breadcrumb_footer(request, cs); 4077 } 4078 4079 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4080 { 4081 cs = gen8_emit_pipe_control(cs, 4082 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4083 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4084 PIPE_CONTROL_DC_FLUSH_ENABLE, 4085 0); 4086 4087 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4088 cs = gen8_emit_ggtt_write_rcs(cs, 4089 request->fence.seqno, 4090 i915_request_active_timeline(request)->hwsp_offset, 4091 PIPE_CONTROL_FLUSH_ENABLE | 4092 PIPE_CONTROL_CS_STALL); 4093 4094 return gen8_emit_fini_breadcrumb_footer(request, cs); 4095 } 4096 4097 static u32 * 4098 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4099 { 4100 cs = gen8_emit_ggtt_write_rcs(cs, 4101 request->fence.seqno, 4102 i915_request_active_timeline(request)->hwsp_offset, 4103 PIPE_CONTROL_CS_STALL | 4104 PIPE_CONTROL_TILE_CACHE_FLUSH | 4105 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4106 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4107 PIPE_CONTROL_DC_FLUSH_ENABLE | 4108 PIPE_CONTROL_FLUSH_ENABLE); 4109 4110 return gen8_emit_fini_breadcrumb_footer(request, cs); 4111 } 4112 4113 /* 4114 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4115 * flush and will continue pre-fetching the instructions after it before the 4116 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4117 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4118 * of the next request before the memory has been flushed, we're guaranteed that 4119 * we won't access the batch itself too early. 4120 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4121 * so, if the current request is modifying an instruction in the next request on 4122 * the same intel_context, we might pre-fetch and then execute the pre-update 4123 * instruction. To avoid this, the users of self-modifying code should either 4124 * disable the parser around the code emitting the memory writes, via a new flag 4125 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4126 * the in-kernel use-cases we've opted to use a separate context, see 4127 * reloc_gpu() as an example. 4128 * All the above applies only to the instructions themselves. Non-inline data 4129 * used by the instructions is not pre-fetched. 4130 */ 4131 4132 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4133 { 4134 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4135 MI_SEMAPHORE_GLOBAL_GTT | 4136 MI_SEMAPHORE_POLL | 4137 MI_SEMAPHORE_SAD_EQ_SDD; 4138 *cs++ = 0; 4139 *cs++ = intel_hws_preempt_address(request->engine); 4140 *cs++ = 0; 4141 *cs++ = 0; 4142 *cs++ = MI_NOOP; 4143 4144 return cs; 4145 } 4146 4147 static __always_inline u32* 4148 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 4149 { 4150 *cs++ = MI_USER_INTERRUPT; 4151 4152 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4153 if (intel_engine_has_semaphores(request->engine)) 4154 cs = gen12_emit_preempt_busywait(request, cs); 4155 4156 request->tail = intel_ring_offset(request, cs); 4157 assert_ring_tail_valid(request->ring, request->tail); 4158 4159 return gen8_emit_wa_tail(request, cs); 4160 } 4161 4162 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4163 { 4164 cs = gen8_emit_ggtt_write(cs, 4165 request->fence.seqno, 4166 i915_request_active_timeline(request)->hwsp_offset, 4167 0); 4168 4169 return gen12_emit_fini_breadcrumb_footer(request, cs); 4170 } 4171 4172 static u32 * 4173 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4174 { 4175 cs = gen8_emit_ggtt_write_rcs(cs, 4176 request->fence.seqno, 4177 i915_request_active_timeline(request)->hwsp_offset, 4178 PIPE_CONTROL_CS_STALL | 4179 PIPE_CONTROL_TILE_CACHE_FLUSH | 4180 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4181 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4182 /* Wa_1409600907:tgl */ 4183 PIPE_CONTROL_DEPTH_STALL | 4184 PIPE_CONTROL_DC_FLUSH_ENABLE | 4185 PIPE_CONTROL_FLUSH_ENABLE | 4186 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 4187 4188 return gen12_emit_fini_breadcrumb_footer(request, cs); 4189 } 4190 4191 static void execlists_park(struct intel_engine_cs *engine) 4192 { 4193 cancel_timer(&engine->execlists.timer); 4194 cancel_timer(&engine->execlists.preempt); 4195 } 4196 4197 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4198 { 4199 engine->submit_request = execlists_submit_request; 4200 engine->schedule = i915_schedule; 4201 engine->execlists.tasklet.func = execlists_submission_tasklet; 4202 4203 engine->reset.prepare = execlists_reset_prepare; 4204 engine->reset.rewind = execlists_reset_rewind; 4205 engine->reset.cancel = execlists_reset_cancel; 4206 engine->reset.finish = execlists_reset_finish; 4207 4208 engine->park = execlists_park; 4209 engine->unpark = NULL; 4210 4211 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 4212 if (!intel_vgpu_active(engine->i915)) { 4213 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 4214 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 4215 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 4216 } 4217 4218 if (INTEL_GEN(engine->i915) >= 12) 4219 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 4220 4221 if (intel_engine_has_preemption(engine)) 4222 engine->emit_bb_start = gen8_emit_bb_start; 4223 else 4224 engine->emit_bb_start = gen8_emit_bb_start_noarb; 4225 } 4226 4227 static void execlists_shutdown(struct intel_engine_cs *engine) 4228 { 4229 /* Synchronise with residual timers and any softirq they raise */ 4230 del_timer_sync(&engine->execlists.timer); 4231 del_timer_sync(&engine->execlists.preempt); 4232 tasklet_kill(&engine->execlists.tasklet); 4233 } 4234 4235 static void execlists_release(struct intel_engine_cs *engine) 4236 { 4237 execlists_shutdown(engine); 4238 4239 intel_engine_cleanup_common(engine); 4240 lrc_destroy_wa_ctx(engine); 4241 } 4242 4243 static void 4244 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 4245 { 4246 /* Default vfuncs which can be overriden by each engine. */ 4247 4248 engine->resume = execlists_resume; 4249 4250 engine->cops = &execlists_context_ops; 4251 engine->request_alloc = execlists_request_alloc; 4252 4253 engine->emit_flush = gen8_emit_flush; 4254 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 4255 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 4256 if (INTEL_GEN(engine->i915) >= 12) 4257 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 4258 4259 engine->set_default_submission = intel_execlists_set_default_submission; 4260 4261 if (INTEL_GEN(engine->i915) < 11) { 4262 engine->irq_enable = gen8_logical_ring_enable_irq; 4263 engine->irq_disable = gen8_logical_ring_disable_irq; 4264 } else { 4265 /* 4266 * TODO: On Gen11 interrupt masks need to be clear 4267 * to allow C6 entry. Keep interrupts enabled at 4268 * and take the hit of generating extra interrupts 4269 * until a more refined solution exists. 4270 */ 4271 } 4272 } 4273 4274 static inline void 4275 logical_ring_default_irqs(struct intel_engine_cs *engine) 4276 { 4277 unsigned int shift = 0; 4278 4279 if (INTEL_GEN(engine->i915) < 11) { 4280 const u8 irq_shifts[] = { 4281 [RCS0] = GEN8_RCS_IRQ_SHIFT, 4282 [BCS0] = GEN8_BCS_IRQ_SHIFT, 4283 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 4284 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 4285 [VECS0] = GEN8_VECS_IRQ_SHIFT, 4286 }; 4287 4288 shift = irq_shifts[engine->id]; 4289 } 4290 4291 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 4292 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 4293 } 4294 4295 static void rcs_submission_override(struct intel_engine_cs *engine) 4296 { 4297 switch (INTEL_GEN(engine->i915)) { 4298 case 12: 4299 engine->emit_flush = gen12_emit_flush_render; 4300 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 4301 break; 4302 case 11: 4303 engine->emit_flush = gen11_emit_flush_render; 4304 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 4305 break; 4306 default: 4307 engine->emit_flush = gen8_emit_flush_render; 4308 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 4309 break; 4310 } 4311 } 4312 4313 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 4314 { 4315 struct intel_engine_execlists * const execlists = &engine->execlists; 4316 struct drm_i915_private *i915 = engine->i915; 4317 struct intel_uncore *uncore = engine->uncore; 4318 u32 base = engine->mmio_base; 4319 4320 tasklet_init(&engine->execlists.tasklet, 4321 execlists_submission_tasklet, (unsigned long)engine); 4322 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 4323 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 4324 4325 logical_ring_default_vfuncs(engine); 4326 logical_ring_default_irqs(engine); 4327 4328 if (engine->class == RENDER_CLASS) 4329 rcs_submission_override(engine); 4330 4331 if (intel_init_workaround_bb(engine)) 4332 /* 4333 * We continue even if we fail to initialize WA batch 4334 * because we only expect rare glitches but nothing 4335 * critical to prevent us from using GPU 4336 */ 4337 DRM_ERROR("WA batch buffer initialization failed\n"); 4338 4339 if (HAS_LOGICAL_RING_ELSQ(i915)) { 4340 execlists->submit_reg = uncore->regs + 4341 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 4342 execlists->ctrl_reg = uncore->regs + 4343 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 4344 } else { 4345 execlists->submit_reg = uncore->regs + 4346 i915_mmio_reg_offset(RING_ELSP(base)); 4347 } 4348 4349 execlists->csb_status = 4350 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 4351 4352 execlists->csb_write = 4353 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 4354 4355 if (INTEL_GEN(i915) < 11) 4356 execlists->csb_size = GEN8_CSB_ENTRIES; 4357 else 4358 execlists->csb_size = GEN11_CSB_ENTRIES; 4359 4360 reset_csb_pointers(engine); 4361 4362 /* Finally, take ownership and responsibility for cleanup! */ 4363 engine->release = execlists_release; 4364 4365 return 0; 4366 } 4367 4368 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) 4369 { 4370 u32 indirect_ctx_offset; 4371 4372 switch (INTEL_GEN(engine->i915)) { 4373 default: 4374 MISSING_CASE(INTEL_GEN(engine->i915)); 4375 /* fall through */ 4376 case 12: 4377 indirect_ctx_offset = 4378 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4379 break; 4380 case 11: 4381 indirect_ctx_offset = 4382 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4383 break; 4384 case 10: 4385 indirect_ctx_offset = 4386 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4387 break; 4388 case 9: 4389 indirect_ctx_offset = 4390 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4391 break; 4392 case 8: 4393 indirect_ctx_offset = 4394 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4395 break; 4396 } 4397 4398 return indirect_ctx_offset; 4399 } 4400 4401 4402 static void init_common_reg_state(u32 * const regs, 4403 const struct intel_engine_cs *engine, 4404 const struct intel_ring *ring, 4405 bool inhibit) 4406 { 4407 u32 ctl; 4408 4409 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 4410 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 4411 if (inhibit) 4412 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 4413 if (INTEL_GEN(engine->i915) < 11) 4414 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 4415 CTX_CTRL_RS_CTX_ENABLE); 4416 regs[CTX_CONTEXT_CONTROL] = ctl; 4417 4418 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 4419 } 4420 4421 static void init_wa_bb_reg_state(u32 * const regs, 4422 const struct intel_engine_cs *engine, 4423 u32 pos_bb_per_ctx) 4424 { 4425 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 4426 4427 if (wa_ctx->per_ctx.size) { 4428 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4429 4430 regs[pos_bb_per_ctx] = 4431 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 4432 } 4433 4434 if (wa_ctx->indirect_ctx.size) { 4435 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4436 4437 regs[pos_bb_per_ctx + 2] = 4438 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 4439 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 4440 4441 regs[pos_bb_per_ctx + 4] = 4442 intel_lr_indirect_ctx_offset(engine) << 6; 4443 } 4444 } 4445 4446 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 4447 { 4448 if (i915_vm_is_4lvl(&ppgtt->vm)) { 4449 /* 64b PPGTT (48bit canonical) 4450 * PDP0_DESCRIPTOR contains the base address to PML4 and 4451 * other PDP Descriptors are ignored. 4452 */ 4453 ASSIGN_CTX_PML4(ppgtt, regs); 4454 } else { 4455 ASSIGN_CTX_PDP(ppgtt, regs, 3); 4456 ASSIGN_CTX_PDP(ppgtt, regs, 2); 4457 ASSIGN_CTX_PDP(ppgtt, regs, 1); 4458 ASSIGN_CTX_PDP(ppgtt, regs, 0); 4459 } 4460 } 4461 4462 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 4463 { 4464 if (i915_is_ggtt(vm)) 4465 return i915_vm_to_ggtt(vm)->alias; 4466 else 4467 return i915_vm_to_ppgtt(vm); 4468 } 4469 4470 static void execlists_init_reg_state(u32 *regs, 4471 const struct intel_context *ce, 4472 const struct intel_engine_cs *engine, 4473 const struct intel_ring *ring, 4474 bool inhibit) 4475 { 4476 /* 4477 * A context is actually a big batch buffer with several 4478 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 4479 * values we are setting here are only for the first context restore: 4480 * on a subsequent save, the GPU will recreate this batchbuffer with new 4481 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 4482 * we are not initializing here). 4483 * 4484 * Must keep consistent with virtual_update_register_offsets(). 4485 */ 4486 set_offsets(regs, reg_offsets(engine), engine, inhibit); 4487 4488 init_common_reg_state(regs, engine, ring, inhibit); 4489 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 4490 4491 init_wa_bb_reg_state(regs, engine, 4492 INTEL_GEN(engine->i915) >= 12 ? 4493 GEN12_CTX_BB_PER_CTX_PTR : 4494 CTX_BB_PER_CTX_PTR); 4495 4496 __reset_stop_ring(regs, engine); 4497 } 4498 4499 static int 4500 populate_lr_context(struct intel_context *ce, 4501 struct drm_i915_gem_object *ctx_obj, 4502 struct intel_engine_cs *engine, 4503 struct intel_ring *ring) 4504 { 4505 bool inhibit = true; 4506 void *vaddr; 4507 int ret; 4508 4509 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 4510 if (IS_ERR(vaddr)) { 4511 ret = PTR_ERR(vaddr); 4512 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 4513 return ret; 4514 } 4515 4516 set_redzone(vaddr, engine); 4517 4518 if (engine->default_state) { 4519 void *defaults; 4520 4521 defaults = i915_gem_object_pin_map(engine->default_state, 4522 I915_MAP_WB); 4523 if (IS_ERR(defaults)) { 4524 ret = PTR_ERR(defaults); 4525 goto err_unpin_ctx; 4526 } 4527 4528 memcpy(vaddr, defaults, engine->context_size); 4529 i915_gem_object_unpin_map(engine->default_state); 4530 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 4531 inhibit = false; 4532 } 4533 4534 /* The second page of the context object contains some fields which must 4535 * be set up prior to the first execution. */ 4536 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE, 4537 ce, engine, ring, inhibit); 4538 4539 ret = 0; 4540 err_unpin_ctx: 4541 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 4542 i915_gem_object_unpin_map(ctx_obj); 4543 return ret; 4544 } 4545 4546 static int __execlists_context_alloc(struct intel_context *ce, 4547 struct intel_engine_cs *engine) 4548 { 4549 struct drm_i915_gem_object *ctx_obj; 4550 struct intel_ring *ring; 4551 struct i915_vma *vma; 4552 u32 context_size; 4553 int ret; 4554 4555 GEM_BUG_ON(ce->state); 4556 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 4557 4558 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4559 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 4560 4561 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 4562 if (IS_ERR(ctx_obj)) 4563 return PTR_ERR(ctx_obj); 4564 4565 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 4566 if (IS_ERR(vma)) { 4567 ret = PTR_ERR(vma); 4568 goto error_deref_obj; 4569 } 4570 4571 if (!ce->timeline) { 4572 struct intel_timeline *tl; 4573 4574 tl = intel_timeline_create(engine->gt, NULL); 4575 if (IS_ERR(tl)) { 4576 ret = PTR_ERR(tl); 4577 goto error_deref_obj; 4578 } 4579 4580 ce->timeline = tl; 4581 } 4582 4583 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 4584 if (IS_ERR(ring)) { 4585 ret = PTR_ERR(ring); 4586 goto error_deref_obj; 4587 } 4588 4589 ret = populate_lr_context(ce, ctx_obj, engine, ring); 4590 if (ret) { 4591 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 4592 goto error_ring_free; 4593 } 4594 4595 ce->ring = ring; 4596 ce->state = vma; 4597 4598 return 0; 4599 4600 error_ring_free: 4601 intel_ring_put(ring); 4602 error_deref_obj: 4603 i915_gem_object_put(ctx_obj); 4604 return ret; 4605 } 4606 4607 static struct list_head *virtual_queue(struct virtual_engine *ve) 4608 { 4609 return &ve->base.execlists.default_priolist.requests[0]; 4610 } 4611 4612 static void virtual_context_destroy(struct kref *kref) 4613 { 4614 struct virtual_engine *ve = 4615 container_of(kref, typeof(*ve), context.ref); 4616 unsigned int n; 4617 4618 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4619 GEM_BUG_ON(ve->request); 4620 GEM_BUG_ON(ve->context.inflight); 4621 4622 for (n = 0; n < ve->num_siblings; n++) { 4623 struct intel_engine_cs *sibling = ve->siblings[n]; 4624 struct rb_node *node = &ve->nodes[sibling->id].rb; 4625 unsigned long flags; 4626 4627 if (RB_EMPTY_NODE(node)) 4628 continue; 4629 4630 spin_lock_irqsave(&sibling->active.lock, flags); 4631 4632 /* Detachment is lazily performed in the execlists tasklet */ 4633 if (!RB_EMPTY_NODE(node)) 4634 rb_erase_cached(node, &sibling->execlists.virtual); 4635 4636 spin_unlock_irqrestore(&sibling->active.lock, flags); 4637 } 4638 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 4639 4640 if (ve->context.state) 4641 __execlists_context_fini(&ve->context); 4642 intel_context_fini(&ve->context); 4643 4644 kfree(ve->bonds); 4645 kfree(ve); 4646 } 4647 4648 static void virtual_engine_initial_hint(struct virtual_engine *ve) 4649 { 4650 int swp; 4651 4652 /* 4653 * Pick a random sibling on starting to help spread the load around. 4654 * 4655 * New contexts are typically created with exactly the same order 4656 * of siblings, and often started in batches. Due to the way we iterate 4657 * the array of sibling when submitting requests, sibling[0] is 4658 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 4659 * randomised across the system, we also help spread the load by the 4660 * first engine we inspect being different each time. 4661 * 4662 * NB This does not force us to execute on this engine, it will just 4663 * typically be the first we inspect for submission. 4664 */ 4665 swp = prandom_u32_max(ve->num_siblings); 4666 if (!swp) 4667 return; 4668 4669 swap(ve->siblings[swp], ve->siblings[0]); 4670 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 4671 virtual_update_register_offsets(ve->context.lrc_reg_state, 4672 ve->siblings[0]); 4673 } 4674 4675 static int virtual_context_alloc(struct intel_context *ce) 4676 { 4677 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4678 4679 return __execlists_context_alloc(ce, ve->siblings[0]); 4680 } 4681 4682 static int virtual_context_pin(struct intel_context *ce) 4683 { 4684 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4685 int err; 4686 4687 /* Note: we must use a real engine class for setting up reg state */ 4688 err = __execlists_context_pin(ce, ve->siblings[0]); 4689 if (err) 4690 return err; 4691 4692 virtual_engine_initial_hint(ve); 4693 return 0; 4694 } 4695 4696 static void virtual_context_enter(struct intel_context *ce) 4697 { 4698 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4699 unsigned int n; 4700 4701 for (n = 0; n < ve->num_siblings; n++) 4702 intel_engine_pm_get(ve->siblings[n]); 4703 4704 intel_timeline_enter(ce->timeline); 4705 } 4706 4707 static void virtual_context_exit(struct intel_context *ce) 4708 { 4709 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4710 unsigned int n; 4711 4712 intel_timeline_exit(ce->timeline); 4713 4714 for (n = 0; n < ve->num_siblings; n++) 4715 intel_engine_pm_put(ve->siblings[n]); 4716 } 4717 4718 static const struct intel_context_ops virtual_context_ops = { 4719 .alloc = virtual_context_alloc, 4720 4721 .pin = virtual_context_pin, 4722 .unpin = execlists_context_unpin, 4723 4724 .enter = virtual_context_enter, 4725 .exit = virtual_context_exit, 4726 4727 .destroy = virtual_context_destroy, 4728 }; 4729 4730 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 4731 { 4732 struct i915_request *rq; 4733 intel_engine_mask_t mask; 4734 4735 rq = READ_ONCE(ve->request); 4736 if (!rq) 4737 return 0; 4738 4739 /* The rq is ready for submission; rq->execution_mask is now stable. */ 4740 mask = rq->execution_mask; 4741 if (unlikely(!mask)) { 4742 /* Invalid selection, submit to a random engine in error */ 4743 i915_request_skip(rq, -ENODEV); 4744 mask = ve->siblings[0]->mask; 4745 } 4746 4747 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 4748 rq->fence.context, rq->fence.seqno, 4749 mask, ve->base.execlists.queue_priority_hint); 4750 4751 return mask; 4752 } 4753 4754 static void virtual_submission_tasklet(unsigned long data) 4755 { 4756 struct virtual_engine * const ve = (struct virtual_engine *)data; 4757 const int prio = ve->base.execlists.queue_priority_hint; 4758 intel_engine_mask_t mask; 4759 unsigned int n; 4760 4761 rcu_read_lock(); 4762 mask = virtual_submission_mask(ve); 4763 rcu_read_unlock(); 4764 if (unlikely(!mask)) 4765 return; 4766 4767 local_irq_disable(); 4768 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 4769 struct intel_engine_cs *sibling = ve->siblings[n]; 4770 struct ve_node * const node = &ve->nodes[sibling->id]; 4771 struct rb_node **parent, *rb; 4772 bool first; 4773 4774 if (unlikely(!(mask & sibling->mask))) { 4775 if (!RB_EMPTY_NODE(&node->rb)) { 4776 spin_lock(&sibling->active.lock); 4777 rb_erase_cached(&node->rb, 4778 &sibling->execlists.virtual); 4779 RB_CLEAR_NODE(&node->rb); 4780 spin_unlock(&sibling->active.lock); 4781 } 4782 continue; 4783 } 4784 4785 spin_lock(&sibling->active.lock); 4786 4787 if (!RB_EMPTY_NODE(&node->rb)) { 4788 /* 4789 * Cheat and avoid rebalancing the tree if we can 4790 * reuse this node in situ. 4791 */ 4792 first = rb_first_cached(&sibling->execlists.virtual) == 4793 &node->rb; 4794 if (prio == node->prio || (prio > node->prio && first)) 4795 goto submit_engine; 4796 4797 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 4798 } 4799 4800 rb = NULL; 4801 first = true; 4802 parent = &sibling->execlists.virtual.rb_root.rb_node; 4803 while (*parent) { 4804 struct ve_node *other; 4805 4806 rb = *parent; 4807 other = rb_entry(rb, typeof(*other), rb); 4808 if (prio > other->prio) { 4809 parent = &rb->rb_left; 4810 } else { 4811 parent = &rb->rb_right; 4812 first = false; 4813 } 4814 } 4815 4816 rb_link_node(&node->rb, rb, parent); 4817 rb_insert_color_cached(&node->rb, 4818 &sibling->execlists.virtual, 4819 first); 4820 4821 submit_engine: 4822 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 4823 node->prio = prio; 4824 if (first && prio > sibling->execlists.queue_priority_hint) { 4825 sibling->execlists.queue_priority_hint = prio; 4826 tasklet_hi_schedule(&sibling->execlists.tasklet); 4827 } 4828 4829 spin_unlock(&sibling->active.lock); 4830 } 4831 local_irq_enable(); 4832 } 4833 4834 static void virtual_submit_request(struct i915_request *rq) 4835 { 4836 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4837 struct i915_request *old; 4838 unsigned long flags; 4839 4840 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 4841 rq->fence.context, 4842 rq->fence.seqno); 4843 4844 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 4845 4846 spin_lock_irqsave(&ve->base.active.lock, flags); 4847 4848 old = ve->request; 4849 if (old) { /* background completion event from preempt-to-busy */ 4850 GEM_BUG_ON(!i915_request_completed(old)); 4851 __i915_request_submit(old); 4852 i915_request_put(old); 4853 } 4854 4855 if (i915_request_completed(rq)) { 4856 __i915_request_submit(rq); 4857 4858 ve->base.execlists.queue_priority_hint = INT_MIN; 4859 ve->request = NULL; 4860 } else { 4861 ve->base.execlists.queue_priority_hint = rq_prio(rq); 4862 ve->request = i915_request_get(rq); 4863 4864 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4865 list_move_tail(&rq->sched.link, virtual_queue(ve)); 4866 4867 tasklet_schedule(&ve->base.execlists.tasklet); 4868 } 4869 4870 spin_unlock_irqrestore(&ve->base.active.lock, flags); 4871 } 4872 4873 static struct ve_bond * 4874 virtual_find_bond(struct virtual_engine *ve, 4875 const struct intel_engine_cs *master) 4876 { 4877 int i; 4878 4879 for (i = 0; i < ve->num_bonds; i++) { 4880 if (ve->bonds[i].master == master) 4881 return &ve->bonds[i]; 4882 } 4883 4884 return NULL; 4885 } 4886 4887 static void 4888 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 4889 { 4890 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4891 intel_engine_mask_t allowed, exec; 4892 struct ve_bond *bond; 4893 4894 allowed = ~to_request(signal)->engine->mask; 4895 4896 bond = virtual_find_bond(ve, to_request(signal)->engine); 4897 if (bond) 4898 allowed &= bond->sibling_mask; 4899 4900 /* Restrict the bonded request to run on only the available engines */ 4901 exec = READ_ONCE(rq->execution_mask); 4902 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 4903 ; 4904 4905 /* Prevent the master from being re-run on the bonded engines */ 4906 to_request(signal)->execution_mask &= ~allowed; 4907 } 4908 4909 struct intel_context * 4910 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 4911 unsigned int count) 4912 { 4913 struct virtual_engine *ve; 4914 unsigned int n; 4915 int err; 4916 4917 if (count == 0) 4918 return ERR_PTR(-EINVAL); 4919 4920 if (count == 1) 4921 return intel_context_create(siblings[0]); 4922 4923 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 4924 if (!ve) 4925 return ERR_PTR(-ENOMEM); 4926 4927 ve->base.i915 = siblings[0]->i915; 4928 ve->base.gt = siblings[0]->gt; 4929 ve->base.uncore = siblings[0]->uncore; 4930 ve->base.id = -1; 4931 4932 ve->base.class = OTHER_CLASS; 4933 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 4934 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4935 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4936 4937 /* 4938 * The decision on whether to submit a request using semaphores 4939 * depends on the saturated state of the engine. We only compute 4940 * this during HW submission of the request, and we need for this 4941 * state to be globally applied to all requests being submitted 4942 * to this engine. Virtual engines encompass more than one physical 4943 * engine and so we cannot accurately tell in advance if one of those 4944 * engines is already saturated and so cannot afford to use a semaphore 4945 * and be pessimized in priority for doing so -- if we are the only 4946 * context using semaphores after all other clients have stopped, we 4947 * will be starved on the saturated system. Such a global switch for 4948 * semaphores is less than ideal, but alas is the current compromise. 4949 */ 4950 ve->base.saturated = ALL_ENGINES; 4951 4952 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 4953 4954 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 4955 intel_engine_init_breadcrumbs(&ve->base); 4956 intel_engine_init_execlists(&ve->base); 4957 4958 ve->base.cops = &virtual_context_ops; 4959 ve->base.request_alloc = execlists_request_alloc; 4960 4961 ve->base.schedule = i915_schedule; 4962 ve->base.submit_request = virtual_submit_request; 4963 ve->base.bond_execute = virtual_bond_execute; 4964 4965 INIT_LIST_HEAD(virtual_queue(ve)); 4966 ve->base.execlists.queue_priority_hint = INT_MIN; 4967 tasklet_init(&ve->base.execlists.tasklet, 4968 virtual_submission_tasklet, 4969 (unsigned long)ve); 4970 4971 intel_context_init(&ve->context, &ve->base); 4972 4973 for (n = 0; n < count; n++) { 4974 struct intel_engine_cs *sibling = siblings[n]; 4975 4976 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 4977 if (sibling->mask & ve->base.mask) { 4978 DRM_DEBUG("duplicate %s entry in load balancer\n", 4979 sibling->name); 4980 err = -EINVAL; 4981 goto err_put; 4982 } 4983 4984 /* 4985 * The virtual engine implementation is tightly coupled to 4986 * the execlists backend -- we push out request directly 4987 * into a tree inside each physical engine. We could support 4988 * layering if we handle cloning of the requests and 4989 * submitting a copy into each backend. 4990 */ 4991 if (sibling->execlists.tasklet.func != 4992 execlists_submission_tasklet) { 4993 err = -ENODEV; 4994 goto err_put; 4995 } 4996 4997 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 4998 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 4999 5000 ve->siblings[ve->num_siblings++] = sibling; 5001 ve->base.mask |= sibling->mask; 5002 5003 /* 5004 * All physical engines must be compatible for their emission 5005 * functions (as we build the instructions during request 5006 * construction and do not alter them before submission 5007 * on the physical engine). We use the engine class as a guide 5008 * here, although that could be refined. 5009 */ 5010 if (ve->base.class != OTHER_CLASS) { 5011 if (ve->base.class != sibling->class) { 5012 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 5013 sibling->class, ve->base.class); 5014 err = -EINVAL; 5015 goto err_put; 5016 } 5017 continue; 5018 } 5019 5020 ve->base.class = sibling->class; 5021 ve->base.uabi_class = sibling->uabi_class; 5022 snprintf(ve->base.name, sizeof(ve->base.name), 5023 "v%dx%d", ve->base.class, count); 5024 ve->base.context_size = sibling->context_size; 5025 5026 ve->base.emit_bb_start = sibling->emit_bb_start; 5027 ve->base.emit_flush = sibling->emit_flush; 5028 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 5029 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 5030 ve->base.emit_fini_breadcrumb_dw = 5031 sibling->emit_fini_breadcrumb_dw; 5032 5033 ve->base.flags = sibling->flags; 5034 } 5035 5036 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 5037 5038 return &ve->context; 5039 5040 err_put: 5041 intel_context_put(&ve->context); 5042 return ERR_PTR(err); 5043 } 5044 5045 struct intel_context * 5046 intel_execlists_clone_virtual(struct intel_engine_cs *src) 5047 { 5048 struct virtual_engine *se = to_virtual_engine(src); 5049 struct intel_context *dst; 5050 5051 dst = intel_execlists_create_virtual(se->siblings, 5052 se->num_siblings); 5053 if (IS_ERR(dst)) 5054 return dst; 5055 5056 if (se->num_bonds) { 5057 struct virtual_engine *de = to_virtual_engine(dst->engine); 5058 5059 de->bonds = kmemdup(se->bonds, 5060 sizeof(*se->bonds) * se->num_bonds, 5061 GFP_KERNEL); 5062 if (!de->bonds) { 5063 intel_context_put(dst); 5064 return ERR_PTR(-ENOMEM); 5065 } 5066 5067 de->num_bonds = se->num_bonds; 5068 } 5069 5070 return dst; 5071 } 5072 5073 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 5074 const struct intel_engine_cs *master, 5075 const struct intel_engine_cs *sibling) 5076 { 5077 struct virtual_engine *ve = to_virtual_engine(engine); 5078 struct ve_bond *bond; 5079 int n; 5080 5081 /* Sanity check the sibling is part of the virtual engine */ 5082 for (n = 0; n < ve->num_siblings; n++) 5083 if (sibling == ve->siblings[n]) 5084 break; 5085 if (n == ve->num_siblings) 5086 return -EINVAL; 5087 5088 bond = virtual_find_bond(ve, master); 5089 if (bond) { 5090 bond->sibling_mask |= sibling->mask; 5091 return 0; 5092 } 5093 5094 bond = krealloc(ve->bonds, 5095 sizeof(*bond) * (ve->num_bonds + 1), 5096 GFP_KERNEL); 5097 if (!bond) 5098 return -ENOMEM; 5099 5100 bond[ve->num_bonds].master = master; 5101 bond[ve->num_bonds].sibling_mask = sibling->mask; 5102 5103 ve->bonds = bond; 5104 ve->num_bonds++; 5105 5106 return 0; 5107 } 5108 5109 struct intel_engine_cs * 5110 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 5111 unsigned int sibling) 5112 { 5113 struct virtual_engine *ve = to_virtual_engine(engine); 5114 5115 if (sibling >= ve->num_siblings) 5116 return NULL; 5117 5118 return ve->siblings[sibling]; 5119 } 5120 5121 void intel_execlists_show_requests(struct intel_engine_cs *engine, 5122 struct drm_printer *m, 5123 void (*show_request)(struct drm_printer *m, 5124 struct i915_request *rq, 5125 const char *prefix), 5126 unsigned int max) 5127 { 5128 const struct intel_engine_execlists *execlists = &engine->execlists; 5129 struct i915_request *rq, *last; 5130 unsigned long flags; 5131 unsigned int count; 5132 struct rb_node *rb; 5133 5134 spin_lock_irqsave(&engine->active.lock, flags); 5135 5136 last = NULL; 5137 count = 0; 5138 list_for_each_entry(rq, &engine->active.requests, sched.link) { 5139 if (count++ < max - 1) 5140 show_request(m, rq, "\t\tE "); 5141 else 5142 last = rq; 5143 } 5144 if (last) { 5145 if (count > max) { 5146 drm_printf(m, 5147 "\t\t...skipping %d executing requests...\n", 5148 count - max); 5149 } 5150 show_request(m, last, "\t\tE "); 5151 } 5152 5153 last = NULL; 5154 count = 0; 5155 if (execlists->queue_priority_hint != INT_MIN) 5156 drm_printf(m, "\t\tQueue priority hint: %d\n", 5157 execlists->queue_priority_hint); 5158 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 5159 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 5160 int i; 5161 5162 priolist_for_each_request(rq, p, i) { 5163 if (count++ < max - 1) 5164 show_request(m, rq, "\t\tQ "); 5165 else 5166 last = rq; 5167 } 5168 } 5169 if (last) { 5170 if (count > max) { 5171 drm_printf(m, 5172 "\t\t...skipping %d queued requests...\n", 5173 count - max); 5174 } 5175 show_request(m, last, "\t\tQ "); 5176 } 5177 5178 last = NULL; 5179 count = 0; 5180 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 5181 struct virtual_engine *ve = 5182 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 5183 struct i915_request *rq = READ_ONCE(ve->request); 5184 5185 if (rq) { 5186 if (count++ < max - 1) 5187 show_request(m, rq, "\t\tV "); 5188 else 5189 last = rq; 5190 } 5191 } 5192 if (last) { 5193 if (count > max) { 5194 drm_printf(m, 5195 "\t\t...skipping %d virtual requests...\n", 5196 count - max); 5197 } 5198 show_request(m, last, "\t\tV "); 5199 } 5200 5201 spin_unlock_irqrestore(&engine->active.lock, flags); 5202 } 5203 5204 void intel_lr_context_reset(struct intel_engine_cs *engine, 5205 struct intel_context *ce, 5206 u32 head, 5207 bool scrub) 5208 { 5209 GEM_BUG_ON(!intel_context_is_pinned(ce)); 5210 5211 /* 5212 * We want a simple context + ring to execute the breadcrumb update. 5213 * We cannot rely on the context being intact across the GPU hang, 5214 * so clear it and rebuild just what we need for the breadcrumb. 5215 * All pending requests for this context will be zapped, and any 5216 * future request will be after userspace has had the opportunity 5217 * to recreate its own state. 5218 */ 5219 if (scrub) 5220 restore_default_state(ce, engine); 5221 5222 /* Rerun the request; its payload has been neutered (if guilty). */ 5223 ce->ring->head = head; 5224 intel_ring_update_space(ce->ring); 5225 5226 __execlists_update_reg_state(ce, engine); 5227 } 5228 5229 bool 5230 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 5231 { 5232 return engine->set_default_submission == 5233 intel_execlists_set_default_submission; 5234 } 5235 5236 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 5237 #include "selftest_lrc.c" 5238 #endif 5239