/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
 * This request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[0];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close);
static void
__execlists_update_reg_state(const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     u32 head);

static void mark_eio(struct i915_request *rq)
{
	if (i915_request_completed(rq))
		return;

	GEM_BUG_ON(i915_request_signaled(rq));

	dma_fence_set_error(&rq->fence, -EIO);
	i915_request_mark_complete(rq);
}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	struct i915_request *active = rq;

	rcu_read_lock();
	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
		if (i915_request_completed(rq))
			break;

		active = rq;
	}
	rcu_read_unlock();

	return active;
}

static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static inline void
ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	/*
	 * On unwinding the active request, we give it a priority bump
	 * if it has completed waiting on any semaphore. If we know that
	 * the request has already started, we can prevent an unwanted
	 * preempt-to-idle cycle by taking that into account now.
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u64
lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
{
	u64 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(engine->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(engine->i915) >= 11) {
		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
								/* bits 48-53 */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
								/* bits 61-63 */
	}

	return desc;
}

static inline unsigned int dword_in_page(void *addr)
{
	return offset_in_page(addr) / sizeof(u32);
}

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool clear)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END(x) 0, (x)
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			if (clear)
				memset32(regs, MI_NOOP, count);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (INTEL_GEN(engine->i915) >= 11)
			*regs |= MI_LRI_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			if (clear)
				regs[1] = 0;
			regs += 2;
		} while (--count);
	}

	if (clear) {
		u8 count = *++data;

		/* Clear past the tail for HW access */
		GEM_BUG_ON(dword_in_page(regs) > count);
		memset32(regs, MI_NOOP, count - dword_in_page(regs));

		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (INTEL_GEN(engine->i915) >= 10)
			*regs |= BIT(0);
	}
}

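/*
 * The per-platform tables below are a small bytecode interpreted by
 * set_offsets(): NOP(x) skips x dwords in the context image, LRI(count, flags)
 * emits an MI_LOAD_REGISTER_IMM header for count register/value pairs,
 * REG()/REG16() encode a register offset relative to engine->mmio_base in one
 * or two 7bit packets, and END(x) terminates the table, with x giving the
 * dword offset up to which the remainder of the state is padded with MI_NOOP
 * when clearing.
 */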
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END(80)
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END(176)
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END(80)
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END(80)
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END(176)
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END(80)
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END(80)
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for virtual engine.
	 */
	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn, *active = NULL;
	struct list_head *uninitialized_var(pl);
	int prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->active.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->active.requests,
					 sched.link) {
		if (i915_request_completed(rq))
			continue; /* XXX */

		__i915_request_unsubmit(rq);

		/*
		 * Push the request back into the queue for later resubmission.
		 * If this request is not native to this physical engine (i.e.
		 * it came from a virtual source), push it back onto the virtual
		 * engine so that it can be moved across onto another physical
		 * engine as load dictates.
		 */
		if (likely(rq->execution_mask == engine->mask)) {
			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
			if (rq_prio(rq) != prio) {
				prio = rq_prio(rq);
				pl = i915_sched_lookup_priolist(engine, prio);
			}
			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

			list_move(&rq->sched.link, pl);
			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);

			active = rq;
		} else {
			struct intel_engine_cs *owner = rq->context->engine;

			/*
			 * Decouple the virtual breadcrumb before moving it
			 * back to the virtual engine -- we don't want the
			 * request to complete in the background and try
			 * and cancel the breadcrumb on the virtual engine
			 * (instead of the old engine where it is linked)!
			 */
			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
				     &rq->fence.flags)) {
				spin_lock_nested(&rq->lock,
						 SINGLE_DEPTH_NESTING);
				i915_request_cancel_breadcrumb(rq);
				spin_unlock(&rq->lock);
			}
			rq->engine = owner;
			owner->submit_request(rq);
			active = NULL;
		}
	}

	return active;
}

struct i915_request *
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	return __unwind_incomplete_requests(engine);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

static void intel_engine_context_in(struct intel_engine_cs *engine)
{
	unsigned long flags;

	if (READ_ONCE(engine->stats.enabled) == 0)
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);

	if (engine->stats.enabled > 0) {
		if (engine->stats.active++ == 0)
			engine->stats.start = ktime_get();
		GEM_BUG_ON(engine->stats.active == 0);
	}

	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}

static void intel_engine_context_out(struct intel_engine_cs *engine)
{
	unsigned long flags;

	if (READ_ONCE(engine->stats.enabled) == 0)
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);

	if (engine->stats.enabled > 0) {
		ktime_t last;

		if (engine->stats.active && --engine->stats.active == 0) {
			/*
			 * Decrement the active context count and in case GPU
			 * is now idle add up to the running total.
			 */
			last = ktime_sub(ktime_get(), engine->stats.start);

			engine->stats.total = ktime_add(engine->stats.total,
							last);
		} else if (engine->stats.active == 0) {
			/*
			 * After turning on engine stats, context out might be
			 * the first event in which case we account from the
			 * time stats gathering was turned on.
			 */
			last = ktime_sub(ktime_get(), engine->stats.enabled_at);

			engine->stats.total = ktime_add(engine->stats.total,
							last);
		}
	}

	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}

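/*
 * Dword index into the lrc register state at which the RING_MI_MODE register
 * offset is stored (its value lives at index + 1), or -1 if the location is
 * not known for this platform.
 */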
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x60;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static void
execlists_check_context(const struct intel_context *ce,
			const struct intel_engine_cs *engine)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
}

static void restore_default_state(struct intel_context *ce,
				  struct intel_engine_cs *engine)
{
	u32 *regs = ce->lrc_reg_state;

	if (engine->pinned_default_state)
		memcpy(regs, /* skip restoring the vanilla PPHWSP */
		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
		       engine->context_size - PAGE_SIZE);

	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
}

static void reset_active(struct i915_request *rq,
			 struct intel_engine_cs *engine)
{
	struct intel_context * const ce = rq->context;
	u32 head;

	/*
	 * The executing context has been cancelled. We want to prevent
	 * further execution along this context and propagate the error on
	 * to anything depending on its results.
	 *
	 * In __i915_request_submit(), we apply the -EIO and remove the
	 * requests' payloads for any banned requests. But first, we must
	 * rewind the context back to the start of the incomplete request so
	 * that we do not jump back into the middle of the batch.
	 *
	 * We preserve the breadcrumbs and semaphores of the incomplete
	 * requests so that inter-timeline dependencies (i.e. other timelines)
	 * remain correctly ordered. And we defer to __i915_request_submit()
	 * so that all asynchronous waits are correctly handled.
	 */
	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
		     rq->fence.context, rq->fence.seqno);

	/* On resubmission of the active request, payload will be scrubbed */
	if (i915_request_completed(rq))
		head = rq->tail;
	else
		head = active_request(ce->timeline, rq)->head;
	head = intel_ring_wrap(ce->ring, head);

	/* Scrub the context image to prevent replaying the previous batch */
	restore_default_state(ce, engine);
	__execlists_update_reg_state(ce, engine, head);

	/* We've switched away, so this should be a no-op, but intent matters */
	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
}

static u32 intel_context_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow += dt < 0;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

static void intel_context_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = intel_context_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt <= 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

static inline struct intel_engine_cs *
__execlists_schedule_in(struct i915_request *rq)
{
	struct intel_engine_cs * const engine = rq->engine;
	struct intel_context * const ce = rq->context;

	intel_context_get(ce);

	if (unlikely(intel_context_is_banned(ce)))
		reset_active(rq, engine);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		execlists_check_context(ce, engine);

	ce->lrc_desc &= ~GENMASK_ULL(47, 37);
	if (ce->tag) {
		/* Use a fixed tag for OA and friends */
		ce->lrc_desc |= (u64)ce->tag << 32;
	} else {
		/* We don't need a strict matching tag, just different values */
		ce->lrc_desc |=
			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
			GEN11_SW_CTX_ID_SHIFT;
		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
	}

	__intel_gt_pm_get(engine->gt);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(engine);

	return engine;
}

static inline struct i915_request *
execlists_schedule_in(struct i915_request *rq, int idx)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *old;

	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
	trace_i915_request_in(rq, idx);

	old = READ_ONCE(ce->inflight);
	do {
		if (!old) {
			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
			break;
		}
	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));

	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
	return i915_request_get(rq);
}

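/*
 * Called as a virtual request is scheduled out: if the next request queued
 * on the virtual engine is not limited to the engines the completed request
 * could use, kick the virtual tasklet so another sibling may pick it up.
 */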
static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	struct i915_request *next = READ_ONCE(ve->request);

	if (next && next->execution_mask & ~rq->execution_mask)
		tasklet_schedule(&ve->base.execlists.tasklet);
}

static inline void
__execlists_schedule_out(struct i915_request *rq,
			 struct intel_engine_cs * const engine)
{
	struct intel_context * const ce = rq->context;

	/*
	 * NB process_csb() is not under the engine->active.lock and hence
	 * schedule_out can race with schedule_in meaning that we should
	 * refrain from doing non-trivial work here.
	 */

	/*
	 * If we have just completed this context, the engine may now be
	 * idle and we want to re-enter powersaving.
	 */
	if (list_is_last(&rq->link, &ce->timeline->requests) &&
	    i915_request_completed(rq))
		intel_engine_add_retire(engine, ce->timeline);

	intel_context_update_runtime(ce);
	intel_engine_context_out(engine);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
	intel_gt_pm_put_async(engine->gt);

	/*
	 * If this is part of a virtual engine, its next request may
	 * have been blocked waiting for access to the active context.
	 * We have to kick all the siblings again in case we need to
	 * switch (e.g. the next request is not runnable on this
	 * engine). Hopefully, we will already have submitted the next
	 * request before the tasklet runs and do not need to rebuild
	 * each virtual tree and kick everyone again.
	 */
	if (ce->engine != engine)
		kick_siblings(rq, ce);

	intel_context_put(ce);
}

static inline void
execlists_schedule_out(struct i915_request *rq)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *cur, *old;

	trace_i915_request_out(rq);

	old = READ_ONCE(ce->inflight);
	do
		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
	while (!try_cmpxchg(&ce->inflight, &old, cur));
	if (!cur)
		__execlists_schedule_out(rq, old);

	i915_request_put(rq);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	u64 desc = ce->lrc_desc;
	u32 tail, prev;

	/*
	 * WaIdleLiteRestore:bdw,skl
	 *
	 * We should never submit the context with the same RING_TAIL twice
	 * just in case we submit an empty ring, which confuses the HW.
	 *
	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
	 * the normal request to be able to always advance the RING_TAIL on
	 * subsequent resubmissions (for lite restore). Should that fail us,
	 * and we try and submit the same tail again, force the context
	 * reload.
	 *
	 * If we need to return to a preempted context, we need to skip the
	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
	 * HW has a tendency to ignore us rewinding the TAIL to the end of
	 * an earlier request.
	 */
	tail = intel_ring_set_tail(rq->ring, rq->tail);
	prev = ce->lrc_reg_state[CTX_RING_TAIL];
	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
		desc |= CTX_DESC_FORCE_RESTORE;
	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
	rq->tail = rq->wa_tail;

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, the empirical
	 * evidence (esp. on Braswell) suggests that the WC write into memory
	 * may not be visible to the HW prior to the completion of the UC
	 * register write and that we may begin execution from the context
	 * before its image is complete leading to invalid PD chasing.
	 */
	wmb();

	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
	return desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static __maybe_unused void
trace_ports(const struct intel_engine_execlists *execlists,
	    const char *msg,
	    struct i915_request * const *ports)
{
	const struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	if (!ports[0])
		return;

	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
		     ports[0]->fence.context,
		     ports[0]->fence.seqno,
		     i915_request_completed(ports[0]) ? "!" :
		     i915_request_started(ports[0]) ? "*" :
		     "",
		     ports[1] ? ports[1]->fence.context : 0,
		     ports[1] ? ports[1]->fence.seqno : 0);
}

static inline bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

static __maybe_unused bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{
	struct i915_request * const *port, *rq;
	struct intel_context *ce = NULL;

	trace_ports(execlists, msg, execlists->pending);

	/* We may be messing around with the lists during reset, lalala */
	if (reset_in_progress(execlists))
		return true;

	if (!execlists->pending[0]) {
		GEM_TRACE_ERR("Nothing pending for promotion!\n");
		return false;
	}

	if (execlists->pending[execlists_num_ports(execlists)]) {
		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
			      execlists_num_ports(execlists));
		return false;
	}

	for (port = execlists->pending; (rq = *port); port++) {
		unsigned long flags;
		bool ok = true;

		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
		GEM_BUG_ON(!i915_request_is_active(rq));

		if (ce == rq->context) {
			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ce = rq->context;

		/* Hold tightly onto the lock to prevent concurrent retires! */
		if (!spin_trylock_irqsave(&rq->lock, flags))
			continue;

		if (i915_request_completed(rq))
			goto unlock;

		if (i915_active_is_idle(&ce->active) &&
		    !intel_context_is_barrier(ce)) {
			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->state)) {
			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->ring->vma)) {
			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

unlock:
		spin_unlock_irqrestore(&rq->lock, flags);
		if (!ok)
			return false;
	}

	return ce;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	unsigned int n;

	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq = execlists->pending[n];

		write_desc(execlists,
			   rq ? execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		intel_context_force_single_submission(ce));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((prev->fence.flags ^ next->fence.flags) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
				     struct intel_engine_cs *engine)
{
	struct intel_engine_cs *old = ve->siblings[0];

	/* All unattached (rq->engine == old) must already be completed */

	spin_lock(&old->breadcrumbs.irq_lock);
	if (!list_empty(&ve->context.signal_link)) {
		list_move_tail(&ve->context.signal_link,
			       &engine->breadcrumbs.signalers);
		intel_engine_signal_breadcrumbs(engine);
	}
	spin_unlock(&old->breadcrumbs.irq_lock);
}

static struct i915_request *
last_active(const struct intel_engine_execlists *execlists)
{
	struct i915_request * const *last = READ_ONCE(execlists->active);

	while (*last && i915_request_completed(*last))
		last++;

	return *last;
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_started(w) &&
				   !i915_request_completed(rq));

			GEM_BUG_ON(i915_request_is_active(w));
			if (!i915_request_is_ready(w))
				continue;

			if (rq_prio(w) < rq_prio(rq))
				continue;

			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}

static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = __unwind_incomplete_requests(engine);
	if (!rq)
		return;

	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
}

static bool
need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	int hint;

	if (!intel_engine_has_timeslices(engine))
		return false;

	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return false;

	hint = max(rq_prio(list_next_entry(rq, sched.link)),
		   engine->execlists.queue_priority_hint);

	return hint >= effective_prio(rq);
}

static int
switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return INT_MIN;

	return rq_prio(list_next_entry(rq, sched.link));
}

static inline unsigned long
timeslice(const struct intel_engine_cs *engine)
{
	return READ_ONCE(engine->props.timeslice_duration_ms);
}

static unsigned long
active_timeslice(const struct intel_engine_cs *engine)
{
	const struct i915_request *rq = *engine->execlists.active;

	if (!rq || i915_request_completed(rq))
		return 0;

	if (engine->execlists.switch_priority_hint < effective_prio(rq))
		return 0;

	return timeslice(engine);
}

static void set_timeslice(struct intel_engine_cs *engine)
{
	if (!intel_engine_has_timeslices(engine))
		return;

	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
}

static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}

static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = last_active(&engine->execlists);
	if (!rq)
		return 0;

	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
	if (unlikely(intel_context_is_banned(rq->context)))
		return 1;

	return READ_ONCE(engine->props.preempt_timeout_ms);
}

static void set_preempt_timeout(struct intel_engine_cs *engine)
{
	if (!intel_engine_has_preempt_reset(engine))
		return;

	set_timer_ms(&engine->execlists.preempt,
		     active_preempt_timeout(engine));
}

static inline void clear_ports(struct i915_request **ports, int count)
{
	memset_p((void **)ports, NULL, count);
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request **port = execlists->pending;
	struct i915_request ** const last_port = port + execlists->port_mask;
	struct i915_request *last;
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (!rq) { /* lazily cleanup after another engine handled rq */
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		if (!virtual_matches(ve, rq, engine)) {
			rb = rb_next(rb);
			continue;
		}

		break;
	}

	/*
	 * If the queue is higher priority than the last
	 * request in the currently active context, submit afresh.
	 * We will resubmit again afterwards in case we need to split
	 * the active context to interject the preemption request,
	 * i.e. we will retrigger preemption following the ack in case
	 * of trouble.
	 */
	last = last_active(execlists);
	if (last) {
		if (need_preempt(engine, last, rb)) {
			ENGINE_TRACE(engine,
				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
				     last->fence.context,
				     last->fence.seqno,
				     last->sched.attr.priority,
				     execlists->queue_priority_hint);
			record_preemption(execlists);

			/*
			 * Don't let the RING_HEAD advance past the breadcrumb
			 * as we unwind (and until we resubmit) so that we do
			 * not accidentally tell it to go backwards.
1892 */ 1893 ring_set_paused(engine, 1); 1894 1895 /* 1896 * Note that we have not stopped the GPU at this point, 1897 * so we are unwinding the incomplete requests as they 1898 * remain inflight and so by the time we do complete 1899 * the preemption, some of the unwound requests may 1900 * complete! 1901 */ 1902 __unwind_incomplete_requests(engine); 1903 1904 last = NULL; 1905 } else if (need_timeslice(engine, last) && 1906 timer_expired(&engine->execlists.timer)) { 1907 ENGINE_TRACE(engine, 1908 "expired last=%llx:%lld, prio=%d, hint=%d\n", 1909 last->fence.context, 1910 last->fence.seqno, 1911 last->sched.attr.priority, 1912 execlists->queue_priority_hint); 1913 1914 ring_set_paused(engine, 1); 1915 defer_active(engine); 1916 1917 /* 1918 * Unlike for preemption, if we rewind and continue 1919 * executing the same context as previously active, 1920 * the order of execution will remain the same and 1921 * the tail will only advance. We do not need to 1922 * force a full context restore, as a lite-restore 1923 * is sufficient to resample the monotonic TAIL. 1924 * 1925 * If we switch to any other context, similarly we 1926 * will not rewind TAIL of current context, and 1927 * normal save/restore will preserve state and allow 1928 * us to later continue executing the same request. 1929 */ 1930 last = NULL; 1931 } else { 1932 /* 1933 * Otherwise if we already have a request pending 1934 * for execution after the current one, we can 1935 * just wait until the next CS event before 1936 * queuing more. In either case we will force a 1937 * lite-restore preemption event, but if we wait 1938 * we hopefully coalesce several updates into a single 1939 * submission. 1940 */ 1941 if (!list_is_last(&last->sched.link, 1942 &engine->active.requests)) { 1943 /* 1944 * Even if ELSP[1] is occupied and not worthy 1945 * of timeslices, our queue might be. 1946 */ 1947 if (!execlists->timer.expires && 1948 need_timeslice(engine, last)) 1949 set_timer_ms(&execlists->timer, 1950 timeslice(engine)); 1951 1952 return; 1953 } 1954 } 1955 } 1956 1957 while (rb) { /* XXX virtual is always taking precedence */ 1958 struct virtual_engine *ve = 1959 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1960 struct i915_request *rq; 1961 1962 spin_lock(&ve->base.active.lock); 1963 1964 rq = ve->request; 1965 if (unlikely(!rq)) { /* lost the race to a sibling */ 1966 spin_unlock(&ve->base.active.lock); 1967 rb_erase_cached(rb, &execlists->virtual); 1968 RB_CLEAR_NODE(rb); 1969 rb = rb_first_cached(&execlists->virtual); 1970 continue; 1971 } 1972 1973 GEM_BUG_ON(rq != ve->request); 1974 GEM_BUG_ON(rq->engine != &ve->base); 1975 GEM_BUG_ON(rq->context != &ve->context); 1976 1977 if (rq_prio(rq) >= queue_prio(execlists)) { 1978 if (!virtual_matches(ve, rq, engine)) { 1979 spin_unlock(&ve->base.active.lock); 1980 rb = rb_next(rb); 1981 continue; 1982 } 1983 1984 if (last && !can_merge_rq(last, rq)) { 1985 spin_unlock(&ve->base.active.lock); 1986 return; /* leave this for another */ 1987 } 1988 1989 ENGINE_TRACE(engine, 1990 "virtual rq=%llx:%lld%s, new engine? %s\n", 1991 rq->fence.context, 1992 rq->fence.seqno, 1993 i915_request_completed(rq) ? "!" : 1994 i915_request_started(rq) ? 
"*" : 1995 "", 1996 yesno(engine != ve->siblings[0])); 1997 1998 ve->request = NULL; 1999 ve->base.execlists.queue_priority_hint = INT_MIN; 2000 rb_erase_cached(rb, &execlists->virtual); 2001 RB_CLEAR_NODE(rb); 2002 2003 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2004 rq->engine = engine; 2005 2006 if (engine != ve->siblings[0]) { 2007 u32 *regs = ve->context.lrc_reg_state; 2008 unsigned int n; 2009 2010 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 2011 2012 if (!intel_engine_has_relative_mmio(engine)) 2013 virtual_update_register_offsets(regs, 2014 engine); 2015 2016 if (!list_empty(&ve->context.signals)) 2017 virtual_xfer_breadcrumbs(ve, engine); 2018 2019 /* 2020 * Move the bound engine to the top of the list 2021 * for future execution. We then kick this 2022 * tasklet first before checking others, so that 2023 * we preferentially reuse this set of bound 2024 * registers. 2025 */ 2026 for (n = 1; n < ve->num_siblings; n++) { 2027 if (ve->siblings[n] == engine) { 2028 swap(ve->siblings[n], 2029 ve->siblings[0]); 2030 break; 2031 } 2032 } 2033 2034 GEM_BUG_ON(ve->siblings[0] != engine); 2035 } 2036 2037 if (__i915_request_submit(rq)) { 2038 submit = true; 2039 last = rq; 2040 } 2041 i915_request_put(rq); 2042 2043 /* 2044 * Hmm, we have a bunch of virtual engine requests, 2045 * but the first one was already completed (thanks 2046 * preempt-to-busy!). Keep looking at the veng queue 2047 * until we have no more relevant requests (i.e. 2048 * the normal submit queue has higher priority). 2049 */ 2050 if (!submit) { 2051 spin_unlock(&ve->base.active.lock); 2052 rb = rb_first_cached(&execlists->virtual); 2053 continue; 2054 } 2055 } 2056 2057 spin_unlock(&ve->base.active.lock); 2058 break; 2059 } 2060 2061 while ((rb = rb_first_cached(&execlists->queue))) { 2062 struct i915_priolist *p = to_priolist(rb); 2063 struct i915_request *rq, *rn; 2064 int i; 2065 2066 priolist_for_each_request_consume(rq, rn, p, i) { 2067 bool merge = true; 2068 2069 /* 2070 * Can we combine this request with the current port? 2071 * It has to be the same context/ringbuffer and not 2072 * have any exceptions (e.g. GVT saying never to 2073 * combine contexts). 2074 * 2075 * If we can combine the requests, we can execute both 2076 * by updating the RING_TAIL to point to the end of the 2077 * second request, and so we never need to tell the 2078 * hardware about the first. 2079 */ 2080 if (last && !can_merge_rq(last, rq)) { 2081 /* 2082 * If we are on the second port and cannot 2083 * combine this request with the last, then we 2084 * are done. 2085 */ 2086 if (port == last_port) 2087 goto done; 2088 2089 /* 2090 * We must not populate both ELSP[] with the 2091 * same LRCA, i.e. we must submit 2 different 2092 * contexts if we submit 2 ELSP. 2093 */ 2094 if (last->context == rq->context) 2095 goto done; 2096 2097 if (i915_request_has_sentinel(last)) 2098 goto done; 2099 2100 /* 2101 * If GVT overrides us we only ever submit 2102 * port[0], leaving port[1] empty. Note that we 2103 * also have to be careful that we don't queue 2104 * the same context (even though a different 2105 * request) to the second port. 
2106 */ 2107 if (ctx_single_port_submission(last->context) || 2108 ctx_single_port_submission(rq->context)) 2109 goto done; 2110 2111 merge = false; 2112 } 2113 2114 if (__i915_request_submit(rq)) { 2115 if (!merge) { 2116 *port = execlists_schedule_in(last, port - execlists->pending); 2117 port++; 2118 last = NULL; 2119 } 2120 2121 GEM_BUG_ON(last && 2122 !can_merge_ctx(last->context, 2123 rq->context)); 2124 2125 submit = true; 2126 last = rq; 2127 } 2128 } 2129 2130 rb_erase_cached(&p->node, &execlists->queue); 2131 i915_priolist_free(p); 2132 } 2133 2134 done: 2135 /* 2136 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2137 * 2138 * We choose the priority hint such that if we add a request of greater 2139 * priority than this, we kick the submission tasklet to decide on 2140 * the right order of submitting the requests to hardware. We must 2141 * also be prepared to reorder requests as they are in-flight on the 2142 * HW. We derive the priority hint then as the first "hole" in 2143 * the HW submission ports and if there are no available slots, 2144 * the priority of the lowest executing request, i.e. last. 2145 * 2146 * When we do receive a higher priority request ready to run from the 2147 * user, see queue_request(), the priority hint is bumped to that 2148 * request triggering preemption on the next dequeue (or subsequent 2149 * interrupt for secondary ports). 2150 */ 2151 execlists->queue_priority_hint = queue_prio(execlists); 2152 2153 if (submit) { 2154 *port = execlists_schedule_in(last, port - execlists->pending); 2155 execlists->switch_priority_hint = 2156 switch_prio(engine, *execlists->pending); 2157 2158 /* 2159 * Skip if we ended up with exactly the same set of requests, 2160 * e.g. trying to timeslice a pair of ordered contexts 2161 */ 2162 if (!memcmp(execlists->active, execlists->pending, 2163 (port - execlists->pending + 1) * sizeof(*port))) { 2164 do 2165 execlists_schedule_out(fetch_and_zero(port)); 2166 while (port-- != execlists->pending); 2167 2168 goto skip_submit; 2169 } 2170 clear_ports(port + 1, last_port - port); 2171 2172 execlists_submit_ports(engine); 2173 set_preempt_timeout(engine); 2174 } else { 2175 skip_submit: 2176 ring_set_paused(engine, 0); 2177 } 2178 } 2179 2180 static void 2181 cancel_port_requests(struct intel_engine_execlists * const execlists) 2182 { 2183 struct i915_request * const *port; 2184 2185 for (port = execlists->pending; *port; port++) 2186 execlists_schedule_out(*port); 2187 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2188 2189 /* Mark the end of active before we overwrite *active */ 2190 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2191 execlists_schedule_out(*port); 2192 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2193 2194 WRITE_ONCE(execlists->active, execlists->inflight); 2195 } 2196 2197 static inline void 2198 invalidate_csb_entries(const u32 *first, const u32 *last) 2199 { 2200 clflush((void *)first); 2201 clflush((void *)last); 2202 } 2203 2204 /* 2205 * Starting with Gen12, the status has a new format: 2206 * 2207 * bit 0: switched to new queue 2208 * bit 1: reserved 2209 * bit 2: semaphore wait mode (poll or signal), only valid when 2210 * switch detail is set to "wait on semaphore" 2211 * bits 3-5: engine class 2212 * bits 6-11: engine instance 2213 * bits 12-14: reserved 2214 * bits 15-25: sw context id of the lrc the GT switched to 2215 * bits 26-31: sw counter of the lrc the GT switched to 2216 * bits 32-35: context switch 
detail 2217 * - 0: ctx complete 2218 * - 1: wait on sync flip 2219 * - 2: wait on vblank 2220 * - 3: wait on scanline 2221 * - 4: wait on semaphore 2222 * - 5: context preempted (not on SEMAPHORE_WAIT or 2223 * WAIT_FOR_EVENT) 2224 * bit 36: reserved 2225 * bits 37-43: wait detail (for switch detail 1 to 4) 2226 * bits 44-46: reserved 2227 * bits 47-57: sw context id of the lrc the GT switched away from 2228 * bits 58-63: sw counter of the lrc the GT switched away from 2229 */ 2230 static inline bool 2231 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2232 { 2233 u32 lower_dw = csb[0]; 2234 u32 upper_dw = csb[1]; 2235 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2236 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2237 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2238 2239 /* 2240 * The context switch detail is not guaranteed to be 5 when a preemption 2241 * occurs, so we can't just check for that. The check below works for 2242 * all the cases we care about, including preemptions of WAIT 2243 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2244 * would require some extra handling, but we don't support that. 2245 */ 2246 if (!ctx_away_valid || new_queue) { 2247 GEM_BUG_ON(!ctx_to_valid); 2248 return true; 2249 } 2250 2251 /* 2252 * switch detail = 5 is covered by the case above and we do not expect a 2253 * context switch on an unsuccessful wait instruction since we always 2254 * use polling mode. 2255 */ 2256 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2257 return false; 2258 } 2259 2260 static inline bool 2261 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2262 { 2263 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2264 } 2265 2266 static void process_csb(struct intel_engine_cs *engine) 2267 { 2268 struct intel_engine_execlists * const execlists = &engine->execlists; 2269 const u32 * const buf = execlists->csb_status; 2270 const u8 num_entries = execlists->csb_size; 2271 u8 head, tail; 2272 2273 /* 2274 * As we modify our execlists state tracking we require exclusive 2275 * access. Either we are inside the tasklet, or the tasklet is disabled 2276 * and we assume that is only inside the reset paths and so serialised. 2277 */ 2278 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2279 !reset_in_progress(execlists)); 2280 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2281 2282 /* 2283 * Note that csb_write, csb_status may be either in HWSP or mmio. 2284 * When reading from the csb_write mmio register, we have to be 2285 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2286 * the low 4 bits. As it happens we know the next 4 bits are always 2287 * zero and so we can simply mask off the low u8 of the register 2288 * and treat it identically to reading from the HWSP (without having 2289 * to use explicit shifting and masking, and probably bifurcating 2290 * the code to handle the legacy mmio read). 2291 */ 2292 head = execlists->csb_head; 2293 tail = READ_ONCE(*execlists->csb_write); 2294 if (unlikely(head == tail)) 2295 return; 2296 2297 /* 2298 * Hopefully paired with a wmb() in HW! 2299 * 2300 * We must complete the read of the write pointer before any reads 2301 * from the CSB, so that we do not see stale values. Without an rmb 2302 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2303 * we perform the READ_ONCE(*csb_write).
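Entries in the half-open range (head, tail] are the ones not yet processed: head is our cached consumer index, tail the producer index the CS writes into the HWSP. As a worked example, with num_entries == 12, head == 10 and tail == 1, the loop below consumes entries 11, 0 and 1, wrapping via the ++head test.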
2304 */ 2305 rmb(); 2306 2307 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2308 do { 2309 bool promote; 2310 2311 if (++head == num_entries) 2312 head = 0; 2313 2314 /* 2315 * We are flying near dragons again. 2316 * 2317 * We hold a reference to the request in execlist_port[] 2318 * but no more than that. We are operating in softirq 2319 * context and so cannot hold any mutex or sleep. That 2320 * prevents us stopping the requests we are processing 2321 * in port[] from being retired simultaneously (the 2322 * breadcrumb will be complete before we see the 2323 * context-switch). As we only hold the reference to the 2324 * request, any pointer chasing underneath the request 2325 * is subject to a potential use-after-free. Thus we 2326 * store all of the bookkeeping within port[] as 2327 * required, and avoid using unguarded pointers beneath 2328 * request itself. The same applies to the atomic 2329 * status notifier. 2330 */ 2331 2332 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2333 head, buf[2 * head + 0], buf[2 * head + 1]); 2334 2335 if (INTEL_GEN(engine->i915) >= 12) 2336 promote = gen12_csb_parse(execlists, buf + 2 * head); 2337 else 2338 promote = gen8_csb_parse(execlists, buf + 2 * head); 2339 if (promote) { 2340 struct i915_request * const *old = execlists->active; 2341 2342 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2343 2344 ring_set_paused(engine, 0); 2345 2346 /* Point active to the new ELSP; prevent overwriting */ 2347 WRITE_ONCE(execlists->active, execlists->pending); 2348 2349 /* cancel old inflight, prepare for switch */ 2350 trace_ports(execlists, "preempted", old); 2351 while (*old) 2352 execlists_schedule_out(*old++); 2353 2354 /* switch pending to inflight */ 2355 WRITE_ONCE(execlists->active, 2356 memcpy(execlists->inflight, 2357 execlists->pending, 2358 execlists_num_ports(execlists) * 2359 sizeof(*execlists->pending))); 2360 2361 WRITE_ONCE(execlists->pending[0], NULL); 2362 } else { 2363 GEM_BUG_ON(!*execlists->active); 2364 2365 /* port0 completed, advanced to port1 */ 2366 trace_ports(execlists, "completed", execlists->active); 2367 2368 /* 2369 * We rely on the hardware being strongly 2370 * ordered, that the breadcrumb write is 2371 * coherent (visible from the CPU) before the 2372 * user interrupt and CSB is processed. 
2373 */ 2374 if (GEM_SHOW_DEBUG() && 2375 !i915_request_completed(*execlists->active) && 2376 !reset_in_progress(execlists)) { 2377 struct i915_request *rq __maybe_unused = 2378 *execlists->active; 2379 const u32 *regs __maybe_unused = 2380 rq->context->lrc_reg_state; 2381 2382 ENGINE_TRACE(engine, 2383 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2384 ENGINE_READ(engine, RING_START), 2385 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2386 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2387 ENGINE_READ(engine, RING_CTL), 2388 ENGINE_READ(engine, RING_MI_MODE)); 2389 ENGINE_TRACE(engine, 2390 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2391 i915_ggtt_offset(rq->ring->vma), 2392 rq->head, rq->tail, 2393 rq->fence.context, 2394 lower_32_bits(rq->fence.seqno), 2395 hwsp_seqno(rq)); 2396 ENGINE_TRACE(engine, 2397 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2398 regs[CTX_RING_START], 2399 regs[CTX_RING_HEAD], 2400 regs[CTX_RING_TAIL]); 2401 2402 GEM_BUG_ON("context completed before request"); 2403 } 2404 2405 execlists_schedule_out(*execlists->active++); 2406 2407 GEM_BUG_ON(execlists->active - execlists->inflight > 2408 execlists_num_ports(execlists)); 2409 } 2410 } while (head != tail); 2411 2412 execlists->csb_head = head; 2413 set_timeslice(engine); 2414 2415 /* 2416 * Gen11 has proven to fail wrt global observation point between 2417 * entry and tail update, failing on the ordering and thus 2418 * we see an old entry in the context status buffer. 2419 * 2420 * Forcibly evict out entries for the next gpu csb update, 2421 * to increase the odds that we get a fresh entries with non 2422 * working hardware. The cost for doing so comes out mostly with 2423 * the wash as hardware, working or not, will need to do the 2424 * invalidation before. 2425 */ 2426 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2427 } 2428 2429 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2430 { 2431 lockdep_assert_held(&engine->active.lock); 2432 if (!READ_ONCE(engine->execlists.pending[0])) { 2433 rcu_read_lock(); /* protect peeking at execlists->active */ 2434 execlists_dequeue(engine); 2435 rcu_read_unlock(); 2436 } 2437 } 2438 2439 static void __execlists_hold(struct i915_request *rq) 2440 { 2441 LIST_HEAD(list); 2442 2443 do { 2444 struct i915_dependency *p; 2445 2446 if (i915_request_is_active(rq)) 2447 __i915_request_unsubmit(rq); 2448 2449 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2450 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2451 i915_request_set_hold(rq); 2452 RQ_TRACE(rq, "on hold\n"); 2453 2454 for_each_waiter(p, rq) { 2455 struct i915_request *w = 2456 container_of(p->waiter, typeof(*w), sched); 2457 2458 /* Leave semaphores spinning on the other engines */ 2459 if (w->engine != rq->engine) 2460 continue; 2461 2462 if (!i915_request_is_ready(w)) 2463 continue; 2464 2465 if (i915_request_completed(w)) 2466 continue; 2467 2468 if (i915_request_on_hold(w)) 2469 continue; 2470 2471 list_move_tail(&w->sched.link, &list); 2472 } 2473 2474 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2475 } while (rq); 2476 } 2477 2478 static bool execlists_hold(struct intel_engine_cs *engine, 2479 struct i915_request *rq) 2480 { 2481 spin_lock_irq(&engine->active.lock); 2482 2483 if (i915_request_completed(rq)) { /* too late! 
*/ 2484 rq = NULL; 2485 goto unlock; 2486 } 2487 2488 if (rq->engine != engine) { /* preempted virtual engine */ 2489 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2490 2491 /* 2492 * intel_context_inflight() is only protected by virtue 2493 * of process_csb() being called only by the tasklet (or 2494 * directly from inside reset while the tasklet is suspended). 2495 * Assert that neither of those is allowed to run while we 2496 * poke at the request queues. 2497 */ 2498 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2499 2500 /* 2501 * An unsubmitted request along a virtual engine will 2502 * remain on the active (this) engine until we are able 2503 * to process the context switch away (and so mark the 2504 * context as no longer in flight). That cannot have happened 2505 * yet, otherwise we would not be hanging! 2506 */ 2507 spin_lock(&ve->base.active.lock); 2508 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2509 GEM_BUG_ON(ve->request != rq); 2510 ve->request = NULL; 2511 spin_unlock(&ve->base.active.lock); 2512 i915_request_put(rq); 2513 2514 rq->engine = engine; 2515 } 2516 2517 /* 2518 * Transfer this request onto the hold queue to prevent it 2519 * being resubmitted to HW (and potentially completed) before we have 2520 * released it. Since we may have already submitted following 2521 * requests, we need to remove those as well. 2522 */ 2523 GEM_BUG_ON(i915_request_on_hold(rq)); 2524 GEM_BUG_ON(rq->engine != engine); 2525 __execlists_hold(rq); 2526 GEM_BUG_ON(list_empty(&engine->active.hold)); 2527 2528 unlock: 2529 spin_unlock_irq(&engine->active.lock); 2530 return rq; 2531 } 2532 2533 static bool hold_request(const struct i915_request *rq) 2534 { 2535 struct i915_dependency *p; 2536 bool result = false; 2537 2538 /* 2539 * If one of our ancestors is on hold, we must also be on hold, 2540 * otherwise we will bypass it and execute before it.
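Only signalers on the same physical engine matter for this check; dependencies on other engines are ordered through their fences before submission rather than through this engine's hold list, which is why the loop below skips them.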
2541 */ 2542 rcu_read_lock(); 2543 for_each_signaler(p, rq) { 2544 const struct i915_request *s = 2545 container_of(p->signaler, typeof(*s), sched); 2546 2547 if (s->engine != rq->engine) 2548 continue; 2549 2550 result = i915_request_on_hold(s); 2551 if (result) 2552 break; 2553 } 2554 rcu_read_unlock(); 2555 2556 return result; 2557 } 2558 2559 static void __execlists_unhold(struct i915_request *rq) 2560 { 2561 LIST_HEAD(list); 2562 2563 do { 2564 struct i915_dependency *p; 2565 2566 RQ_TRACE(rq, "hold release\n"); 2567 2568 GEM_BUG_ON(!i915_request_on_hold(rq)); 2569 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2570 2571 i915_request_clear_hold(rq); 2572 list_move_tail(&rq->sched.link, 2573 i915_sched_lookup_priolist(rq->engine, 2574 rq_prio(rq))); 2575 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2576 2577 /* Also release any children on this engine that are ready */ 2578 for_each_waiter(p, rq) { 2579 struct i915_request *w = 2580 container_of(p->waiter, typeof(*w), sched); 2581 2582 if (w->engine != rq->engine) 2583 continue; 2584 2585 if (!i915_request_on_hold(w)) 2586 continue; 2587 2588 /* Check that no other parents are also on hold */ 2589 if (hold_request(w)) 2590 continue; 2591 2592 list_move_tail(&w->sched.link, &list); 2593 } 2594 2595 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2596 } while (rq); 2597 } 2598 2599 static void execlists_unhold(struct intel_engine_cs *engine, 2600 struct i915_request *rq) 2601 { 2602 spin_lock_irq(&engine->active.lock); 2603 2604 /* 2605 * Move this request back to the priority queue, and all of its 2606 * children and grandchildren that were suspended along with it. 2607 */ 2608 __execlists_unhold(rq); 2609 2610 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2611 engine->execlists.queue_priority_hint = rq_prio(rq); 2612 tasklet_hi_schedule(&engine->execlists.tasklet); 2613 } 2614 2615 spin_unlock_irq(&engine->active.lock); 2616 } 2617 2618 struct execlists_capture { 2619 struct work_struct work; 2620 struct i915_request *rq; 2621 struct i915_gpu_coredump *error; 2622 }; 2623 2624 static void execlists_capture_work(struct work_struct *work) 2625 { 2626 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2627 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2628 struct intel_engine_cs *engine = cap->rq->engine; 2629 struct intel_gt_coredump *gt = cap->error->gt; 2630 struct intel_engine_capture_vma *vma; 2631 2632 /* Compress all the objects attached to the request, slow! 
*/ 2633 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2634 if (vma) { 2635 struct i915_vma_compress *compress = 2636 i915_vma_capture_prepare(gt); 2637 2638 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2639 i915_vma_capture_finish(gt, compress); 2640 } 2641 2642 gt->simulated = gt->engine->simulated; 2643 cap->error->simulated = gt->simulated; 2644 2645 /* Publish the error state, and announce it to the world */ 2646 i915_error_state_store(cap->error); 2647 i915_gpu_coredump_put(cap->error); 2648 2649 /* Return this request and all that depend upon it for signaling */ 2650 execlists_unhold(engine, cap->rq); 2651 i915_request_put(cap->rq); 2652 2653 kfree(cap); 2654 } 2655 2656 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2657 { 2658 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2659 struct execlists_capture *cap; 2660 2661 cap = kmalloc(sizeof(*cap), gfp); 2662 if (!cap) 2663 return NULL; 2664 2665 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2666 if (!cap->error) 2667 goto err_cap; 2668 2669 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2670 if (!cap->error->gt) 2671 goto err_gpu; 2672 2673 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2674 if (!cap->error->gt->engine) 2675 goto err_gt; 2676 2677 return cap; 2678 2679 err_gt: 2680 kfree(cap->error->gt); 2681 err_gpu: 2682 kfree(cap->error); 2683 err_cap: 2684 kfree(cap); 2685 return NULL; 2686 } 2687 2688 static bool execlists_capture(struct intel_engine_cs *engine) 2689 { 2690 struct execlists_capture *cap; 2691 2692 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 2693 return true; 2694 2695 /* 2696 * We need to _quickly_ capture the engine state before we reset. 2697 * We are inside an atomic section (softirq) here and we are delaying 2698 * the forced preemption event. 2699 */ 2700 cap = capture_regs(engine); 2701 if (!cap) 2702 return true; 2703 2704 spin_lock_irq(&engine->active.lock); 2705 cap->rq = execlists_active(&engine->execlists); 2706 if (cap->rq) { 2707 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 2708 cap->rq = i915_request_get_rcu(cap->rq); 2709 } 2710 spin_unlock_irq(&engine->active.lock); 2711 if (!cap->rq) 2712 goto err_free; 2713 2714 /* 2715 * Remove the request from the execlists queue, and take ownership 2716 * of the request. We pass it to our worker who will _slowly_ compress 2717 * all the pages the _user_ requested for debugging their batch, after 2718 * which we return it to the queue for signaling. 2719 * 2720 * By removing them from the execlists queue, we also remove the 2721 * requests from being processed by __unwind_incomplete_requests() 2722 * during the intel_engine_reset(), and so they will *not* be replayed 2723 * afterwards. 2724 * 2725 * Note that because we have not yet reset the engine at this point, 2726 * it is possible for the request that we have identified as being 2727 * guilty, did in fact complete and we will then hit an arbitration 2728 * point allowing the outstanding preemption to succeed. The likelihood 2729 * of that is very low (as capturing of the engine registers should be 2730 * fast enough to run inside an irq-off atomic section!), so we will 2731 * simply hold that request accountable for being non-preemptible 2732 * long enough to force the reset. 
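Should execlists_hold() fail because the request did manage to complete in that window, we drop our reference and bail out below; the caller then skips the engine reset and simply unpauses the ring.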
2733 */ 2734 if (!execlists_hold(engine, cap->rq)) 2735 goto err_rq; 2736 2737 INIT_WORK(&cap->work, execlists_capture_work); 2738 schedule_work(&cap->work); 2739 return true; 2740 2741 err_rq: 2742 i915_request_put(cap->rq); 2743 err_free: 2744 i915_gpu_coredump_put(cap->error); 2745 kfree(cap); 2746 return false; 2747 } 2748 2749 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 2750 { 2751 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2752 unsigned long *lock = &engine->gt->reset.flags; 2753 2754 if (!intel_has_reset_engine(engine->gt)) 2755 return; 2756 2757 if (test_and_set_bit(bit, lock)) 2758 return; 2759 2760 ENGINE_TRACE(engine, "reset for %s\n", msg); 2761 2762 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2763 tasklet_disable_nosync(&engine->execlists.tasklet); 2764 2765 ring_set_paused(engine, 1); /* Freeze the current request in place */ 2766 if (execlists_capture(engine)) 2767 intel_engine_reset(engine, msg); 2768 else 2769 ring_set_paused(engine, 0); 2770 2771 tasklet_enable(&engine->execlists.tasklet); 2772 clear_and_wake_up_bit(bit, lock); 2773 } 2774 2775 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2776 { 2777 const struct timer_list *t = &engine->execlists.preempt; 2778 2779 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2780 return false; 2781 2782 if (!timer_expired(t)) 2783 return false; 2784 2785 return READ_ONCE(engine->execlists.pending[0]); 2786 } 2787 2788 /* 2789 * Check the unread Context Status Buffers and manage the submission of new 2790 * contexts to the ELSP accordingly. 2791 */ 2792 static void execlists_submission_tasklet(unsigned long data) 2793 { 2794 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2795 bool timeout = preempt_timeout(engine); 2796 2797 process_csb(engine); 2798 2799 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 2800 engine->execlists.error_interrupt = 0; 2801 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */ 2802 execlists_reset(engine, "CS error"); 2803 } 2804 2805 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2806 unsigned long flags; 2807 2808 spin_lock_irqsave(&engine->active.lock, flags); 2809 __execlists_submission_tasklet(engine); 2810 spin_unlock_irqrestore(&engine->active.lock, flags); 2811 2812 /* Recheck after serialising with direct-submission */ 2813 if (unlikely(timeout && preempt_timeout(engine))) 2814 execlists_reset(engine, "preemption time out"); 2815 } 2816 } 2817 2818 static void __execlists_kick(struct intel_engine_execlists *execlists) 2819 { 2820 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2821 tasklet_hi_schedule(&execlists->tasklet); 2822 } 2823 2824 #define execlists_kick(t, member) \ 2825 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2826 2827 static void execlists_timeslice(struct timer_list *timer) 2828 { 2829 execlists_kick(timer, timer); 2830 } 2831 2832 static void execlists_preempt(struct timer_list *timer) 2833 { 2834 execlists_kick(timer, preempt); 2835 } 2836 2837 static void queue_request(struct intel_engine_cs *engine, 2838 struct i915_request *rq) 2839 { 2840 GEM_BUG_ON(!list_empty(&rq->sched.link)); 2841 list_add_tail(&rq->sched.link, 2842 i915_sched_lookup_priolist(engine, rq_prio(rq))); 2843 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2844 } 2845 2846 static void __submit_queue_imm(struct intel_engine_cs *engine) 2847 { 2848 struct intel_engine_execlists * const execlists = 
&engine->execlists; 2849 2850 if (reset_in_progress(execlists)) 2851 return; /* defer until we restart the engine following reset */ 2852 2853 if (execlists->tasklet.func == execlists_submission_tasklet) 2854 __execlists_submission_tasklet(engine); 2855 else 2856 tasklet_hi_schedule(&execlists->tasklet); 2857 } 2858 2859 static void submit_queue(struct intel_engine_cs *engine, 2860 const struct i915_request *rq) 2861 { 2862 struct intel_engine_execlists *execlists = &engine->execlists; 2863 2864 if (rq_prio(rq) <= execlists->queue_priority_hint) 2865 return; 2866 2867 execlists->queue_priority_hint = rq_prio(rq); 2868 __submit_queue_imm(engine); 2869 } 2870 2871 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 2872 const struct i915_request *rq) 2873 { 2874 GEM_BUG_ON(i915_request_on_hold(rq)); 2875 return !list_empty(&engine->active.hold) && hold_request(rq); 2876 } 2877 2878 static void execlists_submit_request(struct i915_request *request) 2879 { 2880 struct intel_engine_cs *engine = request->engine; 2881 unsigned long flags; 2882 2883 /* Will be called from irq-context when using foreign fences. */ 2884 spin_lock_irqsave(&engine->active.lock, flags); 2885 2886 if (unlikely(ancestor_on_hold(engine, request))) { 2887 RQ_TRACE(request, "ancestor on hold\n"); 2888 list_add_tail(&request->sched.link, &engine->active.hold); 2889 i915_request_set_hold(request); 2890 } else { 2891 queue_request(engine, request); 2892 2893 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2894 GEM_BUG_ON(list_empty(&request->sched.link)); 2895 2896 submit_queue(engine, request); 2897 } 2898 2899 spin_unlock_irqrestore(&engine->active.lock, flags); 2900 } 2901 2902 static void __execlists_context_fini(struct intel_context *ce) 2903 { 2904 intel_ring_put(ce->ring); 2905 i915_vma_put(ce->state); 2906 } 2907 2908 static void execlists_context_destroy(struct kref *kref) 2909 { 2910 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2911 2912 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2913 GEM_BUG_ON(intel_context_is_pinned(ce)); 2914 2915 if (ce->state) 2916 __execlists_context_fini(ce); 2917 2918 intel_context_fini(ce); 2919 intel_context_free(ce); 2920 } 2921 2922 static void 2923 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2924 { 2925 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2926 return; 2927 2928 vaddr += engine->context_size; 2929 2930 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 2931 } 2932 2933 static void 2934 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2935 { 2936 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2937 return; 2938 2939 vaddr += engine->context_size; 2940 2941 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 2942 dev_err_once(engine->i915->drm.dev, 2943 "%s context redzone overwritten!\n", 2944 engine->name); 2945 } 2946 2947 static void execlists_context_unpin(struct intel_context *ce) 2948 { 2949 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2950 ce->engine); 2951 2952 i915_gem_object_unpin_map(ce->state->obj); 2953 } 2954 2955 static void 2956 __execlists_update_reg_state(const struct intel_context *ce, 2957 const struct intel_engine_cs *engine, 2958 u32 head) 2959 { 2960 struct intel_ring *ring = ce->ring; 2961 u32 *regs = ce->lrc_reg_state; 2962 2963 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 2964 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2965 2966 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 2967 regs[CTX_RING_HEAD] = head; 2968 
regs[CTX_RING_TAIL] = ring->tail; 2969 2970 /* RPCS */ 2971 if (engine->class == RENDER_CLASS) { 2972 regs[CTX_R_PWR_CLK_STATE] = 2973 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2974 2975 i915_oa_init_reg_state(ce, engine); 2976 } 2977 } 2978 2979 static int 2980 __execlists_context_pin(struct intel_context *ce, 2981 struct intel_engine_cs *engine) 2982 { 2983 void *vaddr; 2984 2985 GEM_BUG_ON(!ce->state); 2986 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2987 2988 vaddr = i915_gem_object_pin_map(ce->state->obj, 2989 i915_coherent_map_type(engine->i915) | 2990 I915_MAP_OVERRIDE); 2991 if (IS_ERR(vaddr)) 2992 return PTR_ERR(vaddr); 2993 2994 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 2995 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2996 __execlists_update_reg_state(ce, engine, ce->ring->tail); 2997 2998 return 0; 2999 } 3000 3001 static int execlists_context_pin(struct intel_context *ce) 3002 { 3003 return __execlists_context_pin(ce, ce->engine); 3004 } 3005 3006 static int execlists_context_alloc(struct intel_context *ce) 3007 { 3008 return __execlists_context_alloc(ce, ce->engine); 3009 } 3010 3011 static void execlists_context_reset(struct intel_context *ce) 3012 { 3013 CE_TRACE(ce, "reset\n"); 3014 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3015 3016 intel_ring_reset(ce->ring, ce->ring->emit); 3017 3018 /* Scrub away the garbage */ 3019 execlists_init_reg_state(ce->lrc_reg_state, 3020 ce, ce->engine, ce->ring, true); 3021 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3022 3023 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 3024 } 3025 3026 static const struct intel_context_ops execlists_context_ops = { 3027 .alloc = execlists_context_alloc, 3028 3029 .pin = execlists_context_pin, 3030 .unpin = execlists_context_unpin, 3031 3032 .enter = intel_context_enter_engine, 3033 .exit = intel_context_exit_engine, 3034 3035 .reset = execlists_context_reset, 3036 .destroy = execlists_context_destroy, 3037 }; 3038 3039 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3040 { 3041 u32 *cs; 3042 3043 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3044 return 0; 3045 3046 cs = intel_ring_begin(rq, 6); 3047 if (IS_ERR(cs)) 3048 return PTR_ERR(cs); 3049 3050 /* 3051 * Check if we have been preempted before we even get started. 3052 * 3053 * After this point i915_request_started() reports true, even if 3054 * we get preempted and so are no longer running. 3055 */ 3056 *cs++ = MI_ARB_CHECK; 3057 *cs++ = MI_NOOP; 3058 3059 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3060 *cs++ = i915_request_timeline(rq)->hwsp_offset; 3061 *cs++ = 0; 3062 *cs++ = rq->fence.seqno - 1; 3063 3064 intel_ring_advance(rq, cs); 3065 3066 /* Record the updated position of the request's payload */ 3067 rq->infix = intel_ring_offset(rq, cs); 3068 3069 return 0; 3070 } 3071 3072 static int execlists_request_alloc(struct i915_request *request) 3073 { 3074 int ret; 3075 3076 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3077 3078 /* 3079 * Flush enough space to reduce the likelihood of waiting after 3080 * we start building the request - in which case we will just 3081 * have to repeat work. 3082 */ 3083 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3084 3085 /* 3086 * Note that after this point, we have committed to using 3087 * this request as it is being used to both track the 3088 * state of engine initialisation and liveness of the 3089 * golden renderstate above. Think twice before you try 3090 * to cancel/unwind this request now. 
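The reserved_space adjustment around the flush below is there to guarantee that the fini breadcrumb emitted when the request is closed will always find enough ring space without having to wait.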
3091 */ 3092 3093 /* Unconditionally invalidate GPU caches and TLBs. */ 3094 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3095 if (ret) 3096 return ret; 3097 3098 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3099 return 0; 3100 } 3101 3102 /* 3103 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 3104 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3105 * but there is a slight complication as this is applied in a WA batch where the 3106 * values are only initialized once, so we cannot take the register value at the 3107 * beginning and reuse it further; hence we save its value to memory, upload a 3108 * constant value with bit21 set and then we restore it back with the saved value. 3109 * To simplify the WA, a constant value is formed by using the default value 3110 * of this register. This shouldn't be a problem because we are only modifying 3111 * it for a short period and this batch is non-preemptible. We could of course 3112 * use additional instructions that read the actual value of the register 3113 * at that time and set our bit of interest, but that makes the WA more complicated. 3114 * 3115 * This WA is also required for Gen9, so extracting it as a function avoids 3116 * code duplication. 3117 */ 3118 static u32 * 3119 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3120 { 3121 /* NB no one else is allowed to scribble over scratch + 256! */ 3122 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3123 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3124 *batch++ = intel_gt_scratch_offset(engine->gt, 3125 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3126 *batch++ = 0; 3127 3128 *batch++ = MI_LOAD_REGISTER_IMM(1); 3129 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3130 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3131 3132 batch = gen8_emit_pipe_control(batch, 3133 PIPE_CONTROL_CS_STALL | 3134 PIPE_CONTROL_DC_FLUSH_ENABLE, 3135 0); 3136 3137 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3138 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3139 *batch++ = intel_gt_scratch_offset(engine->gt, 3140 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3141 *batch++ = 0; 3142 3143 return batch; 3144 } 3145 3146 /* 3147 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3148 * initialized at the beginning and shared across all contexts but this field 3149 * helps us to have multiple batches at different offsets and select them based 3150 * on some criteria. At the moment this batch always starts at the beginning of the page 3151 * and at this point we don't have multiple wa_ctx batch buffers. 3152 * 3153 * The number of WAs applied is not known at the beginning; we use this field 3154 * to return the number of DWORDS written. 3155 * 3156 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3157 * so it adds NOOPs as padding to make it cacheline aligned. 3158 * MI_BATCH_BUFFER_END will be added to the per_ctx batch and both of them together 3159 * make a complete batch buffer.
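Note that with the table in intel_init_workaround_bb() below, only the indirect_ctx half is ever populated (wa_bb_fn[1] is NULL for every gen handled there), so the per_ctx batch currently stays empty.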
3160 */ 3161 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3162 { 3163 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3164 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3165 3166 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3167 if (IS_BROADWELL(engine->i915)) 3168 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3169 3170 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3171 /* Actual scratch location is at 128 bytes offset */ 3172 batch = gen8_emit_pipe_control(batch, 3173 PIPE_CONTROL_FLUSH_L3 | 3174 PIPE_CONTROL_STORE_DATA_INDEX | 3175 PIPE_CONTROL_CS_STALL | 3176 PIPE_CONTROL_QW_WRITE, 3177 LRC_PPHWSP_SCRATCH_ADDR); 3178 3179 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3180 3181 /* Pad to end of cacheline */ 3182 while ((unsigned long)batch % CACHELINE_BYTES) 3183 *batch++ = MI_NOOP; 3184 3185 /* 3186 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3187 * execution depends on the length specified in terms of cache lines 3188 * in the register CTX_RCS_INDIRECT_CTX 3189 */ 3190 3191 return batch; 3192 } 3193 3194 struct lri { 3195 i915_reg_t reg; 3196 u32 value; 3197 }; 3198 3199 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3200 { 3201 GEM_BUG_ON(!count || count > 63); 3202 3203 *batch++ = MI_LOAD_REGISTER_IMM(count); 3204 do { 3205 *batch++ = i915_mmio_reg_offset(lri->reg); 3206 *batch++ = lri->value; 3207 } while (lri++, --count); 3208 *batch++ = MI_NOOP; 3209 3210 return batch; 3211 } 3212 3213 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3214 { 3215 static const struct lri lri[] = { 3216 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3217 { 3218 COMMON_SLICE_CHICKEN2, 3219 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3220 0), 3221 }, 3222 3223 /* BSpec: 11391 */ 3224 { 3225 FF_SLICE_CHICKEN, 3226 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3227 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3228 }, 3229 3230 /* BSpec: 11299 */ 3231 { 3232 _3D_CHICKEN3, 3233 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3234 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3235 } 3236 }; 3237 3238 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3239 3240 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3241 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3242 3243 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3244 batch = gen8_emit_pipe_control(batch, 3245 PIPE_CONTROL_FLUSH_L3 | 3246 PIPE_CONTROL_STORE_DATA_INDEX | 3247 PIPE_CONTROL_CS_STALL | 3248 PIPE_CONTROL_QW_WRITE, 3249 LRC_PPHWSP_SCRATCH_ADDR); 3250 3251 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3252 3253 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3254 if (HAS_POOLED_EU(engine->i915)) { 3255 /* 3256 * EU pool configuration is setup along with golden context 3257 * during context initialization. This value depends on 3258 * device type (2x6 or 3x6) and needs to be updated based 3259 * on which subslice is disabled especially for 2x6 3260 * devices, however it is safe to load default 3261 * configuration of 3x6 device instead of masking off 3262 * corresponding bits because HW ignores bits of a disabled 3263 * subslice and drops down to appropriate config. Please 3264 * see render_state_setup() in i915_gem_render_state.c for 3265 * possible configurations, to avoid duplication they are 3266 * not shown here again. 
3267 */ 3268 *batch++ = GEN9_MEDIA_POOL_STATE; 3269 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3270 *batch++ = 0x00777000; 3271 *batch++ = 0; 3272 *batch++ = 0; 3273 *batch++ = 0; 3274 } 3275 3276 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3277 3278 /* Pad to end of cacheline */ 3279 while ((unsigned long)batch % CACHELINE_BYTES) 3280 *batch++ = MI_NOOP; 3281 3282 return batch; 3283 } 3284 3285 static u32 * 3286 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3287 { 3288 int i; 3289 3290 /* 3291 * WaPipeControlBefore3DStateSamplePattern: cnl 3292 * 3293 * Ensure the engine is idle prior to programming a 3294 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3295 */ 3296 batch = gen8_emit_pipe_control(batch, 3297 PIPE_CONTROL_CS_STALL, 3298 0); 3299 /* 3300 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3301 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3302 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3303 * confusing. Since gen8_emit_pipe_control() already advances the 3304 * batch by 6 dwords, we advance the other 10 here, completing a 3305 * cacheline. It's not clear if the workaround requires this padding 3306 * before other commands, or if it's just the regular padding we would 3307 * already have for the workaround bb, so leave it here for now. 3308 */ 3309 for (i = 0; i < 10; i++) 3310 *batch++ = MI_NOOP; 3311 3312 /* Pad to end of cacheline */ 3313 while ((unsigned long)batch % CACHELINE_BYTES) 3314 *batch++ = MI_NOOP; 3315 3316 return batch; 3317 } 3318 3319 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3320 3321 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3322 { 3323 struct drm_i915_gem_object *obj; 3324 struct i915_vma *vma; 3325 int err; 3326 3327 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3328 if (IS_ERR(obj)) 3329 return PTR_ERR(obj); 3330 3331 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3332 if (IS_ERR(vma)) { 3333 err = PTR_ERR(vma); 3334 goto err; 3335 } 3336 3337 err = i915_ggtt_pin(vma, 0, PIN_HIGH); 3338 if (err) 3339 goto err; 3340 3341 engine->wa_ctx.vma = vma; 3342 return 0; 3343 3344 err: 3345 i915_gem_object_put(obj); 3346 return err; 3347 } 3348 3349 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3350 { 3351 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3352 } 3353 3354 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3355 3356 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3357 { 3358 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3359 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3360 &wa_ctx->per_ctx }; 3361 wa_bb_func_t wa_bb_fn[2]; 3362 struct page *page; 3363 void *batch, *batch_ptr; 3364 unsigned int i; 3365 int ret; 3366 3367 if (engine->class != RENDER_CLASS) 3368 return 0; 3369 3370 switch (INTEL_GEN(engine->i915)) { 3371 case 12: 3372 case 11: 3373 return 0; 3374 case 10: 3375 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3376 wa_bb_fn[1] = NULL; 3377 break; 3378 case 9: 3379 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3380 wa_bb_fn[1] = NULL; 3381 break; 3382 case 8: 3383 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3384 wa_bb_fn[1] = NULL; 3385 break; 3386 default: 3387 MISSING_CASE(INTEL_GEN(engine->i915)); 3388 return 0; 3389 } 3390 3391 ret = lrc_setup_wa_ctx(engine); 3392 if (ret) { 3393 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 3394 return ret; 3395 } 3396 3397 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 3398 
batch = batch_ptr = kmap_atomic(page); 3399 3400 /* 3401 * Emit the two workaround batch buffers, recording the offset from the 3402 * start of the workaround batch buffer object for each and their 3403 * respective sizes. 3404 */ 3405 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3406 wa_bb[i]->offset = batch_ptr - batch; 3407 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3408 CACHELINE_BYTES))) { 3409 ret = -EINVAL; 3410 break; 3411 } 3412 if (wa_bb_fn[i]) 3413 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3414 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3415 } 3416 3417 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3418 3419 kunmap_atomic(batch); 3420 if (ret) 3421 lrc_destroy_wa_ctx(engine); 3422 3423 return ret; 3424 } 3425 3426 static void enable_error_interrupt(struct intel_engine_cs *engine) 3427 { 3428 u32 status; 3429 3430 engine->execlists.error_interrupt = 0; 3431 ENGINE_WRITE(engine, RING_EMR, ~0u); 3432 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 3433 3434 status = ENGINE_READ(engine, RING_ESR); 3435 if (unlikely(status)) { 3436 dev_err(engine->i915->drm.dev, 3437 "engine '%s' resumed still in error: %08x\n", 3438 engine->name, status); 3439 __intel_gt_reset(engine->gt, engine->mask); 3440 } 3441 3442 /* 3443 * On current gen8+, we have 2 signals to play with 3444 * 3445 * - I915_ERROR_INSTRUCTION (bit 0) 3446 * 3447 * Generate an error if the command parser encounters an invalid 3448 * instruction 3449 * 3450 * This is a fatal error. 3451 * 3452 * - CP_PRIV (bit 2) 3453 * 3454 * Generate an error on privilege violation (where the CP replaces 3455 * the instruction with a no-op). This also fires for writes into 3456 * read-only scratch pages. 3457 * 3458 * This is a non-fatal error, parsing continues. 3459 * 3460 * * there are a few others defined for odd HW that we do not use 3461 * 3462 * Since CP_PRIV fires for cases where we have chosen to ignore the 3463 * error (as the HW is validating and suppressing the mistakes), we 3464 * only unmask the instruction error bit.
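Hence the EMR write below leaves everything masked except I915_ERROR_INSTRUCTION; when that error fires, the interrupt path flags execlists.error_interrupt and the submission tasklet confirms the fault via RING_ESR before resetting the engine (see execlists_submission_tasklet()).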
3465 */ 3466 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 3467 } 3468 3469 static void enable_execlists(struct intel_engine_cs *engine) 3470 { 3471 u32 mode; 3472 3473 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 3474 3475 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 3476 3477 if (INTEL_GEN(engine->i915) >= 11) 3478 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 3479 else 3480 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 3481 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 3482 3483 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 3484 3485 ENGINE_WRITE_FW(engine, 3486 RING_HWS_PGA, 3487 i915_ggtt_offset(engine->status_page.vma)); 3488 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 3489 3490 enable_error_interrupt(engine); 3491 3492 engine->context_tag = 0; 3493 } 3494 3495 static bool unexpected_starting_state(struct intel_engine_cs *engine) 3496 { 3497 bool unexpected = false; 3498 3499 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 3500 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 3501 unexpected = true; 3502 } 3503 3504 return unexpected; 3505 } 3506 3507 static int execlists_resume(struct intel_engine_cs *engine) 3508 { 3509 intel_mocs_init_engine(engine); 3510 3511 intel_engine_reset_breadcrumbs(engine); 3512 3513 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 3514 struct drm_printer p = drm_debug_printer(__func__); 3515 3516 intel_engine_dump(engine, &p, NULL); 3517 } 3518 3519 enable_execlists(engine); 3520 3521 return 0; 3522 } 3523 3524 static void execlists_reset_prepare(struct intel_engine_cs *engine) 3525 { 3526 struct intel_engine_execlists * const execlists = &engine->execlists; 3527 unsigned long flags; 3528 3529 ENGINE_TRACE(engine, "depth<-%d\n", 3530 atomic_read(&execlists->tasklet.count)); 3531 3532 /* 3533 * Prevent request submission to the hardware until we have 3534 * completed the reset in i915_gem_reset_finish(). If a request 3535 * is completed by one engine, it may then queue a request 3536 * to a second via its execlists->tasklet *just* as we are 3537 * calling engine->resume() and also writing the ELSP. 3538 * Turning off the execlists->tasklet until the reset is over 3539 * prevents the race. 3540 */ 3541 __tasklet_disable_sync_once(&execlists->tasklet); 3542 GEM_BUG_ON(!reset_in_progress(execlists)); 3543 3544 /* And flush any current direct submission. */ 3545 spin_lock_irqsave(&engine->active.lock, flags); 3546 spin_unlock_irqrestore(&engine->active.lock, flags); 3547 3548 /* 3549 * We stop engines, otherwise we might get failed reset and a 3550 * dead gpu (on elk). Also as modern gpu as kbl can suffer 3551 * from system hang if batchbuffer is progressing when 3552 * the reset is issued, regardless of READY_TO_RESET ack. 3553 * Thus assume it is best to stop engines on all gens 3554 * where we have a gpu reset. 3555 * 3556 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 3557 * 3558 * FIXME: Wa for more modern gens needs to be validated 3559 */ 3560 intel_engine_stop_cs(engine); 3561 } 3562 3563 static void reset_csb_pointers(struct intel_engine_cs *engine) 3564 { 3565 struct intel_engine_execlists * const execlists = &engine->execlists; 3566 const unsigned int reset_value = execlists->csb_size - 1; 3567 3568 ring_set_paused(engine, 0); 3569 3570 /* 3571 * After a reset, the HW starts writing into CSB entry [0]. We 3572 * therefore have to set our HEAD pointer back one entry so that 3573 * the *first* entry we check is entry 0. 
To complicate this further, 3574 * as we don't wait for the first interrupt after reset, we have to 3575 * fake the HW write to point back to the last entry so that our 3576 * inline comparison of our cached head position against the last HW 3577 * write works even before the first interrupt. 3578 */ 3579 execlists->csb_head = reset_value; 3580 WRITE_ONCE(*execlists->csb_write, reset_value); 3581 wmb(); /* Make sure this is visible to HW (paranoia?) */ 3582 3583 /* 3584 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3585 * Bludgeon them with a mmio update to be sure. 3586 */ 3587 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3588 reset_value << 8 | reset_value); 3589 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3590 3591 invalidate_csb_entries(&execlists->csb_status[0], 3592 &execlists->csb_status[reset_value]); 3593 } 3594 3595 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 3596 { 3597 int x; 3598 3599 x = lrc_ring_mi_mode(engine); 3600 if (x != -1) { 3601 regs[x + 1] &= ~STOP_RING; 3602 regs[x + 1] |= STOP_RING << 16; 3603 } 3604 } 3605 3606 static void __execlists_reset_reg_state(const struct intel_context *ce, 3607 const struct intel_engine_cs *engine) 3608 { 3609 u32 *regs = ce->lrc_reg_state; 3610 3611 __reset_stop_ring(regs, engine); 3612 } 3613 3614 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 3615 { 3616 struct intel_engine_execlists * const execlists = &engine->execlists; 3617 struct intel_context *ce; 3618 struct i915_request *rq; 3619 u32 head; 3620 3621 mb(); /* paranoia: read the CSB pointers from after the reset */ 3622 clflush(execlists->csb_write); 3623 mb(); 3624 3625 process_csb(engine); /* drain preemption events */ 3626 3627 /* Following the reset, we need to reload the CSB read/write pointers */ 3628 reset_csb_pointers(engine); 3629 3630 /* 3631 * Save the currently executing context, even if we completed 3632 * its request, it was still running at the time of the 3633 * reset and will have been clobbered. 3634 */ 3635 rq = execlists_active(execlists); 3636 if (!rq) 3637 goto unwind; 3638 3639 /* We still have requests in-flight; the engine should be active */ 3640 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3641 3642 ce = rq->context; 3643 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3644 3645 if (i915_request_completed(rq)) { 3646 /* Idle context; tidy up the ring so we can restart afresh */ 3647 head = intel_ring_wrap(ce->ring, rq->tail); 3648 goto out_replay; 3649 } 3650 3651 /* Context has requests still in-flight; it should not be idle! */ 3652 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3653 rq = active_request(ce->timeline, rq); 3654 head = intel_ring_wrap(ce->ring, rq->head); 3655 GEM_BUG_ON(head == ce->ring->tail); 3656 3657 /* 3658 * If this request hasn't started yet, e.g. it is waiting on a 3659 * semaphore, we need to avoid skipping the request or else we 3660 * break the signaling chain. However, if the context is corrupt 3661 * the request will not restart and we will be stuck with a wedged 3662 * device. It is quite often the case that if we issue a reset 3663 * while the GPU is loading the context image, that the context 3664 * image becomes corrupt. 3665 * 3666 * Otherwise, if we have not started yet, the request should replay 3667 * perfectly and we do not need to flag the result as being erroneous. 
3668 */ 3669 if (!i915_request_started(rq)) 3670 goto out_replay; 3671 3672 /* 3673 * If the request was innocent, we leave the request in the ELSP 3674 * and will try to replay it on restarting. The context image may 3675 * have been corrupted by the reset, in which case we may have 3676 * to service a new GPU hang, but more likely we can continue on 3677 * without impact. 3678 * 3679 * If the request was guilty, we presume the context is corrupt 3680 * and have to at least restore the RING register in the context 3681 * image back to the expected values to skip over the guilty request. 3682 */ 3683 __i915_request_reset(rq, stalled); 3684 if (!stalled) 3685 goto out_replay; 3686 3687 /* 3688 * We want a simple context + ring to execute the breadcrumb update. 3689 * We cannot rely on the context being intact across the GPU hang, 3690 * so clear it and rebuild just what we need for the breadcrumb. 3691 * All pending requests for this context will be zapped, and any 3692 * future request will be after userspace has had the opportunity 3693 * to recreate its own state. 3694 */ 3695 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3696 restore_default_state(ce, engine); 3697 3698 out_replay: 3699 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 3700 head, ce->ring->tail); 3701 __execlists_reset_reg_state(ce, engine); 3702 __execlists_update_reg_state(ce, engine, head); 3703 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3704 3705 unwind: 3706 /* Push back any incomplete requests for replay after the reset. */ 3707 cancel_port_requests(execlists); 3708 __unwind_incomplete_requests(engine); 3709 } 3710 3711 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 3712 { 3713 unsigned long flags; 3714 3715 ENGINE_TRACE(engine, "\n"); 3716 3717 spin_lock_irqsave(&engine->active.lock, flags); 3718 3719 __execlists_reset(engine, stalled); 3720 3721 spin_unlock_irqrestore(&engine->active.lock, flags); 3722 } 3723 3724 static void nop_submission_tasklet(unsigned long data) 3725 { 3726 /* The driver is wedged; don't process any more events. */ 3727 } 3728 3729 static void execlists_reset_cancel(struct intel_engine_cs *engine) 3730 { 3731 struct intel_engine_execlists * const execlists = &engine->execlists; 3732 struct i915_request *rq, *rn; 3733 struct rb_node *rb; 3734 unsigned long flags; 3735 3736 ENGINE_TRACE(engine, "\n"); 3737 3738 /* 3739 * Before we call engine->cancel_requests(), we should have exclusive 3740 * access to the submission state. This is arranged for us by the 3741 * caller disabling the interrupt generation, the tasklet and other 3742 * threads that may then access the same state, giving us a free hand 3743 * to reset state. However, we still need to let lockdep be aware that 3744 * we know this state may be accessed in hardirq context, so we 3745 * disable the irq around this manipulation and we want to keep 3746 * the spinlock focused on its duties and not accidentally conflate 3747 * coverage to the submission's irq state. (Similarly, although we 3748 * shouldn't need to disable irq around the manipulation of the 3749 * submission's irq state, we also wish to remind ourselves that 3750 * it is irq state.) 3751 */ 3752 spin_lock_irqsave(&engine->active.lock, flags); 3753 3754 __execlists_reset(engine, true); 3755 3756 /* Mark all executing requests as skipped. */ 3757 list_for_each_entry(rq, &engine->active.requests, sched.link) 3758 mark_eio(rq); 3759 3760 /* Flush the queued requests to the timeline list (for retiring). 
*/ 3761 while ((rb = rb_first_cached(&execlists->queue))) { 3762 struct i915_priolist *p = to_priolist(rb); 3763 int i; 3764 3765 priolist_for_each_request_consume(rq, rn, p, i) { 3766 mark_eio(rq); 3767 __i915_request_submit(rq); 3768 } 3769 3770 rb_erase_cached(&p->node, &execlists->queue); 3771 i915_priolist_free(p); 3772 } 3773 3774 /* On-hold requests will be flushed to timeline upon their release */ 3775 list_for_each_entry(rq, &engine->active.hold, sched.link) 3776 mark_eio(rq); 3777 3778 /* Cancel all attached virtual engines */ 3779 while ((rb = rb_first_cached(&execlists->virtual))) { 3780 struct virtual_engine *ve = 3781 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3782 3783 rb_erase_cached(rb, &execlists->virtual); 3784 RB_CLEAR_NODE(rb); 3785 3786 spin_lock(&ve->base.active.lock); 3787 rq = fetch_and_zero(&ve->request); 3788 if (rq) { 3789 mark_eio(rq); 3790 3791 rq->engine = engine; 3792 __i915_request_submit(rq); 3793 i915_request_put(rq); 3794 3795 ve->base.execlists.queue_priority_hint = INT_MIN; 3796 } 3797 spin_unlock(&ve->base.active.lock); 3798 } 3799 3800 /* Remaining _unready_ requests will be nop'ed when submitted */ 3801 3802 execlists->queue_priority_hint = INT_MIN; 3803 execlists->queue = RB_ROOT_CACHED; 3804 3805 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3806 execlists->tasklet.func = nop_submission_tasklet; 3807 3808 spin_unlock_irqrestore(&engine->active.lock, flags); 3809 } 3810 3811 static void execlists_reset_finish(struct intel_engine_cs *engine) 3812 { 3813 struct intel_engine_execlists * const execlists = &engine->execlists; 3814 3815 /* 3816 * After a GPU reset, we may have requests to replay. Do so now while 3817 * we still have the forcewake to be sure that the GPU is not allowed 3818 * to sleep before we restart and reload a context. 3819 */ 3820 GEM_BUG_ON(!reset_in_progress(execlists)); 3821 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3822 execlists->tasklet.func(execlists->tasklet.data); 3823 3824 if (__tasklet_enable(&execlists->tasklet)) 3825 /* And kick in case we missed a new request submission. */ 3826 tasklet_hi_schedule(&execlists->tasklet); 3827 ENGINE_TRACE(engine, "depth->%d\n", 3828 atomic_read(&execlists->tasklet.count)); 3829 } 3830 3831 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 3832 u64 offset, u32 len, 3833 const unsigned int flags) 3834 { 3835 u32 *cs; 3836 3837 cs = intel_ring_begin(rq, 4); 3838 if (IS_ERR(cs)) 3839 return PTR_ERR(cs); 3840 3841 /* 3842 * WaDisableCtxRestoreArbitration:bdw,chv 3843 * 3844 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3845 * particular on all the gens that do not need the w/a at all!): if we 3846 * took care to make sure that on every switch into this context 3847 * (both ordinary and for preemption) arbitration was enabled, 3848 * we would be fine. However, for gen8 there is another w/a that 3849 * requires us to not preempt inside GPGPU execution, so we keep 3850 * arbitration disabled for gen8 batches. Arbitration will be 3851 * re-enabled before we close the request 3852 * (engine->emit_fini_breadcrumb). 3853 */ 3854 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3855 3856 /* FIXME(BDW+): Address space and security selectors. */ 3857 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3858 (flags & I915_DISPATCH_SECURE ?
0 : BIT(8)); 3859 *cs++ = lower_32_bits(offset); 3860 *cs++ = upper_32_bits(offset); 3861 3862 intel_ring_advance(rq, cs); 3863 3864 return 0; 3865 } 3866 3867 static int gen8_emit_bb_start(struct i915_request *rq, 3868 u64 offset, u32 len, 3869 const unsigned int flags) 3870 { 3871 u32 *cs; 3872 3873 cs = intel_ring_begin(rq, 6); 3874 if (IS_ERR(cs)) 3875 return PTR_ERR(cs); 3876 3877 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3878 3879 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3880 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3881 *cs++ = lower_32_bits(offset); 3882 *cs++ = upper_32_bits(offset); 3883 3884 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3885 *cs++ = MI_NOOP; 3886 3887 intel_ring_advance(rq, cs); 3888 3889 return 0; 3890 } 3891 3892 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3893 { 3894 ENGINE_WRITE(engine, RING_IMR, 3895 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3896 ENGINE_POSTING_READ(engine, RING_IMR); 3897 } 3898 3899 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3900 { 3901 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3902 } 3903 3904 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3905 { 3906 u32 cmd, *cs; 3907 3908 cs = intel_ring_begin(request, 4); 3909 if (IS_ERR(cs)) 3910 return PTR_ERR(cs); 3911 3912 cmd = MI_FLUSH_DW + 1; 3913 3914 /* We always require a command barrier so that subsequent 3915 * commands, such as breadcrumb interrupts, are strictly ordered 3916 * wrt the contents of the write cache being flushed to memory 3917 * (and thus being coherent from the CPU). 3918 */ 3919 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3920 3921 if (mode & EMIT_INVALIDATE) { 3922 cmd |= MI_INVALIDATE_TLB; 3923 if (request->engine->class == VIDEO_DECODE_CLASS) 3924 cmd |= MI_INVALIDATE_BSD; 3925 } 3926 3927 *cs++ = cmd; 3928 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3929 *cs++ = 0; /* upper addr */ 3930 *cs++ = 0; /* value */ 3931 intel_ring_advance(request, cs); 3932 3933 return 0; 3934 } 3935 3936 static int gen8_emit_flush_render(struct i915_request *request, 3937 u32 mode) 3938 { 3939 bool vf_flush_wa = false, dc_flush_wa = false; 3940 u32 *cs, flags = 0; 3941 int len; 3942 3943 flags |= PIPE_CONTROL_CS_STALL; 3944 3945 if (mode & EMIT_FLUSH) { 3946 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3947 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3948 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3949 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3950 } 3951 3952 if (mode & EMIT_INVALIDATE) { 3953 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3954 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3955 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3956 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3957 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3958 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3959 flags |= PIPE_CONTROL_QW_WRITE; 3960 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3961 3962 /* 3963 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3964 * pipe control. 
3965 */ 3966 if (IS_GEN(request->i915, 9)) 3967 vf_flush_wa = true; 3968 3969 /* WaForGAMHang:kbl */ 3970 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3971 dc_flush_wa = true; 3972 } 3973 3974 len = 6; 3975 3976 if (vf_flush_wa) 3977 len += 6; 3978 3979 if (dc_flush_wa) 3980 len += 12; 3981 3982 cs = intel_ring_begin(request, len); 3983 if (IS_ERR(cs)) 3984 return PTR_ERR(cs); 3985 3986 if (vf_flush_wa) 3987 cs = gen8_emit_pipe_control(cs, 0, 0); 3988 3989 if (dc_flush_wa) 3990 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3991 0); 3992 3993 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3994 3995 if (dc_flush_wa) 3996 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3997 3998 intel_ring_advance(request, cs); 3999 4000 return 0; 4001 } 4002 4003 static int gen11_emit_flush_render(struct i915_request *request, 4004 u32 mode) 4005 { 4006 if (mode & EMIT_FLUSH) { 4007 u32 *cs; 4008 u32 flags = 0; 4009 4010 flags |= PIPE_CONTROL_CS_STALL; 4011 4012 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4013 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4014 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4015 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4016 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4017 flags |= PIPE_CONTROL_QW_WRITE; 4018 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4019 4020 cs = intel_ring_begin(request, 6); 4021 if (IS_ERR(cs)) 4022 return PTR_ERR(cs); 4023 4024 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4025 intel_ring_advance(request, cs); 4026 } 4027 4028 if (mode & EMIT_INVALIDATE) { 4029 u32 *cs; 4030 u32 flags = 0; 4031 4032 flags |= PIPE_CONTROL_CS_STALL; 4033 4034 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4035 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4036 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4037 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4038 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4039 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4040 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4041 flags |= PIPE_CONTROL_QW_WRITE; 4042 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4043 4044 cs = intel_ring_begin(request, 6); 4045 if (IS_ERR(cs)) 4046 return PTR_ERR(cs); 4047 4048 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4049 intel_ring_advance(request, cs); 4050 } 4051 4052 return 0; 4053 } 4054 4055 static u32 preparser_disable(bool state) 4056 { 4057 return MI_ARB_CHECK | 1 << 8 | state; 4058 } 4059 4060 static int gen12_emit_flush_render(struct i915_request *request, 4061 u32 mode) 4062 { 4063 if (mode & EMIT_FLUSH) { 4064 u32 flags = 0; 4065 u32 *cs; 4066 4067 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4068 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4069 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4070 /* Wa_1409600907:tgl */ 4071 flags |= PIPE_CONTROL_DEPTH_STALL; 4072 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4073 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4074 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 4075 4076 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4077 flags |= PIPE_CONTROL_QW_WRITE; 4078 4079 flags |= PIPE_CONTROL_CS_STALL; 4080 4081 cs = intel_ring_begin(request, 6); 4082 if (IS_ERR(cs)) 4083 return PTR_ERR(cs); 4084 4085 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4086 intel_ring_advance(request, cs); 4087 } 4088 4089 if (mode & EMIT_INVALIDATE) { 4090 u32 flags = 0; 4091 u32 *cs; 4092 4093 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4094 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4095 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4096 flags |= 
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4097 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4098 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4099 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4100 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 4101 4102 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4103 flags |= PIPE_CONTROL_QW_WRITE; 4104 4105 flags |= PIPE_CONTROL_CS_STALL; 4106 4107 cs = intel_ring_begin(request, 8); 4108 if (IS_ERR(cs)) 4109 return PTR_ERR(cs); 4110 4111 /* 4112 * Prevent the pre-parser from skipping past the TLB 4113 * invalidate and loading a stale page for the batch 4114 * buffer / request payload. 4115 */ 4116 *cs++ = preparser_disable(true); 4117 4118 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4119 4120 *cs++ = preparser_disable(false); 4121 intel_ring_advance(request, cs); 4122 4123 /* 4124 * Wa_1604544889:tgl 4125 */ 4126 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 4127 flags = 0; 4128 flags |= PIPE_CONTROL_CS_STALL; 4129 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 4130 4131 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4132 flags |= PIPE_CONTROL_QW_WRITE; 4133 4134 cs = intel_ring_begin(request, 6); 4135 if (IS_ERR(cs)) 4136 return PTR_ERR(cs); 4137 4138 cs = gen8_emit_pipe_control(cs, flags, 4139 LRC_PPHWSP_SCRATCH_ADDR); 4140 intel_ring_advance(request, cs); 4141 } 4142 } 4143 4144 return 0; 4145 } 4146 4147 /* 4148 * Reserve space for 2 NOOPs at the end of each request to be 4149 * used as a workaround for not being allowed to do lite 4150 * restore with HEAD==TAIL (WaIdleLiteRestore). 4151 */ 4152 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4153 { 4154 /* Ensure there's always at least one preemption point per-request. */ 4155 *cs++ = MI_ARB_CHECK; 4156 *cs++ = MI_NOOP; 4157 request->wa_tail = intel_ring_offset(request, cs); 4158 4159 return cs; 4160 } 4161 4162 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4163 { 4164 *cs++ = MI_SEMAPHORE_WAIT | 4165 MI_SEMAPHORE_GLOBAL_GTT | 4166 MI_SEMAPHORE_POLL | 4167 MI_SEMAPHORE_SAD_EQ_SDD; 4168 *cs++ = 0; 4169 *cs++ = intel_hws_preempt_address(request->engine); 4170 *cs++ = 0; 4171 4172 return cs; 4173 } 4174 4175 static __always_inline u32* 4176 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 4177 u32 *cs) 4178 { 4179 *cs++ = MI_USER_INTERRUPT; 4180 4181 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4182 if (intel_engine_has_semaphores(request->engine)) 4183 cs = emit_preempt_busywait(request, cs); 4184 4185 request->tail = intel_ring_offset(request, cs); 4186 assert_ring_tail_valid(request->ring, request->tail); 4187 4188 return gen8_emit_wa_tail(request, cs); 4189 } 4190 4191 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4192 { 4193 cs = gen8_emit_ggtt_write(cs, 4194 request->fence.seqno, 4195 i915_request_active_timeline(request)->hwsp_offset, 4196 0); 4197 4198 return gen8_emit_fini_breadcrumb_footer(request, cs); 4199 } 4200 4201 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4202 { 4203 cs = gen8_emit_pipe_control(cs, 4204 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4205 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4206 PIPE_CONTROL_DC_FLUSH_ENABLE, 4207 0); 4208 4209 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4210 cs = gen8_emit_ggtt_write_rcs(cs, 4211 request->fence.seqno, 4212 i915_request_active_timeline(request)->hwsp_offset, 4213 PIPE_CONTROL_FLUSH_ENABLE | 4214 PIPE_CONTROL_CS_STALL); 4215 4216 return gen8_emit_fini_breadcrumb_footer(request, 
cs); 4217 } 4218 4219 static u32 * 4220 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4221 { 4222 cs = gen8_emit_ggtt_write_rcs(cs, 4223 request->fence.seqno, 4224 i915_request_active_timeline(request)->hwsp_offset, 4225 PIPE_CONTROL_CS_STALL | 4226 PIPE_CONTROL_TILE_CACHE_FLUSH | 4227 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4228 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4229 PIPE_CONTROL_DC_FLUSH_ENABLE | 4230 PIPE_CONTROL_FLUSH_ENABLE); 4231 4232 return gen8_emit_fini_breadcrumb_footer(request, cs); 4233 } 4234 4235 /* 4236 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4237 * flush and will continue pre-fetching the instructions after it before the 4238 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4239 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4240 * of the next request before the memory has been flushed, we're guaranteed that 4241 * we won't access the batch itself too early. 4242 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4243 * so, if the current request is modifying an instruction in the next request on 4244 * the same intel_context, we might pre-fetch and then execute the pre-update 4245 * instruction. To avoid this, the users of self-modifying code should either 4246 * disable the parser around the code emitting the memory writes, via a new flag 4247 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4248 * the in-kernel use-cases we've opted to use a separate context, see 4249 * reloc_gpu() as an example. 4250 * All the above applies only to the instructions themselves. Non-inline data 4251 * used by the instructions is not pre-fetched. 4252 */ 4253 4254 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4255 { 4256 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4257 MI_SEMAPHORE_GLOBAL_GTT | 4258 MI_SEMAPHORE_POLL | 4259 MI_SEMAPHORE_SAD_EQ_SDD; 4260 *cs++ = 0; 4261 *cs++ = intel_hws_preempt_address(request->engine); 4262 *cs++ = 0; 4263 *cs++ = 0; 4264 *cs++ = MI_NOOP; 4265 4266 return cs; 4267 } 4268 4269 static __always_inline u32* 4270 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 4271 { 4272 *cs++ = MI_USER_INTERRUPT; 4273 4274 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4275 if (intel_engine_has_semaphores(request->engine)) 4276 cs = gen12_emit_preempt_busywait(request, cs); 4277 4278 request->tail = intel_ring_offset(request, cs); 4279 assert_ring_tail_valid(request->ring, request->tail); 4280 4281 return gen8_emit_wa_tail(request, cs); 4282 } 4283 4284 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4285 { 4286 cs = gen8_emit_ggtt_write(cs, 4287 request->fence.seqno, 4288 i915_request_active_timeline(request)->hwsp_offset, 4289 0); 4290 4291 return gen12_emit_fini_breadcrumb_footer(request, cs); 4292 } 4293 4294 static u32 * 4295 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4296 { 4297 cs = gen8_emit_ggtt_write_rcs(cs, 4298 request->fence.seqno, 4299 i915_request_active_timeline(request)->hwsp_offset, 4300 PIPE_CONTROL_CS_STALL | 4301 PIPE_CONTROL_TILE_CACHE_FLUSH | 4302 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4303 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4304 /* Wa_1409600907:tgl */ 4305 PIPE_CONTROL_DEPTH_STALL | 4306 PIPE_CONTROL_DC_FLUSH_ENABLE | 4307 PIPE_CONTROL_FLUSH_ENABLE | 4308 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 4309 4310 return gen12_emit_fini_breadcrumb_footer(request, cs); 4311 } 4312 4313 static void 
execlists_park(struct intel_engine_cs *engine) 4314 { 4315 cancel_timer(&engine->execlists.timer); 4316 cancel_timer(&engine->execlists.preempt); 4317 } 4318 4319 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4320 { 4321 engine->submit_request = execlists_submit_request; 4322 engine->schedule = i915_schedule; 4323 engine->execlists.tasklet.func = execlists_submission_tasklet; 4324 4325 engine->reset.prepare = execlists_reset_prepare; 4326 engine->reset.rewind = execlists_reset_rewind; 4327 engine->reset.cancel = execlists_reset_cancel; 4328 engine->reset.finish = execlists_reset_finish; 4329 4330 engine->park = execlists_park; 4331 engine->unpark = NULL; 4332 4333 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 4334 if (!intel_vgpu_active(engine->i915)) { 4335 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 4336 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 4337 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 4338 } 4339 4340 if (INTEL_GEN(engine->i915) >= 12) 4341 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 4342 4343 if (intel_engine_has_preemption(engine)) 4344 engine->emit_bb_start = gen8_emit_bb_start; 4345 else 4346 engine->emit_bb_start = gen8_emit_bb_start_noarb; 4347 } 4348 4349 static void execlists_shutdown(struct intel_engine_cs *engine) 4350 { 4351 /* Synchronise with residual timers and any softirq they raise */ 4352 del_timer_sync(&engine->execlists.timer); 4353 del_timer_sync(&engine->execlists.preempt); 4354 tasklet_kill(&engine->execlists.tasklet); 4355 } 4356 4357 static void execlists_release(struct intel_engine_cs *engine) 4358 { 4359 execlists_shutdown(engine); 4360 4361 intel_engine_cleanup_common(engine); 4362 lrc_destroy_wa_ctx(engine); 4363 } 4364 4365 static void 4366 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 4367 { 4368 /* Default vfuncs which can be overridden by each engine. */ 4369 4370 engine->resume = execlists_resume; 4371 4372 engine->cops = &execlists_context_ops; 4373 engine->request_alloc = execlists_request_alloc; 4374 4375 engine->emit_flush = gen8_emit_flush; 4376 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 4377 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 4378 if (INTEL_GEN(engine->i915) >= 12) 4379 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 4380 4381 engine->set_default_submission = intel_execlists_set_default_submission; 4382 4383 if (INTEL_GEN(engine->i915) < 11) { 4384 engine->irq_enable = gen8_logical_ring_enable_irq; 4385 engine->irq_disable = gen8_logical_ring_disable_irq; 4386 } else { 4387 /* 4388 * TODO: On Gen11 interrupt masks need to be clear 4389 * to allow C6 entry. Keep interrupts enabled at 4390 * all times and take the hit of generating extra interrupts 4391 * until a more refined solution exists.
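* (Hence no irq_enable/irq_disable callbacks are installed for Gen11+, and the user interrupt is simply left enabled.)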
4392 */ 4393 } 4394 } 4395 4396 static inline void 4397 logical_ring_default_irqs(struct intel_engine_cs *engine) 4398 { 4399 unsigned int shift = 0; 4400 4401 if (INTEL_GEN(engine->i915) < 11) { 4402 const u8 irq_shifts[] = { 4403 [RCS0] = GEN8_RCS_IRQ_SHIFT, 4404 [BCS0] = GEN8_BCS_IRQ_SHIFT, 4405 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 4406 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 4407 [VECS0] = GEN8_VECS_IRQ_SHIFT, 4408 }; 4409 4410 shift = irq_shifts[engine->id]; 4411 } 4412 4413 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 4414 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 4415 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 4416 } 4417 4418 static void rcs_submission_override(struct intel_engine_cs *engine) 4419 { 4420 switch (INTEL_GEN(engine->i915)) { 4421 case 12: 4422 engine->emit_flush = gen12_emit_flush_render; 4423 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 4424 break; 4425 case 11: 4426 engine->emit_flush = gen11_emit_flush_render; 4427 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 4428 break; 4429 default: 4430 engine->emit_flush = gen8_emit_flush_render; 4431 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 4432 break; 4433 } 4434 } 4435 4436 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 4437 { 4438 struct intel_engine_execlists * const execlists = &engine->execlists; 4439 struct drm_i915_private *i915 = engine->i915; 4440 struct intel_uncore *uncore = engine->uncore; 4441 u32 base = engine->mmio_base; 4442 4443 tasklet_init(&engine->execlists.tasklet, 4444 execlists_submission_tasklet, (unsigned long)engine); 4445 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 4446 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 4447 4448 logical_ring_default_vfuncs(engine); 4449 logical_ring_default_irqs(engine); 4450 4451 if (engine->class == RENDER_CLASS) 4452 rcs_submission_override(engine); 4453 4454 if (intel_init_workaround_bb(engine)) 4455 /* 4456 * We continue even if we fail to initialize the WA batch 4457 * because we only expect rare glitches, but nothing 4458 * critical enough to prevent us from using the GPU. 4459 */ 4460 DRM_ERROR("WA batch buffer initialization failed\n"); 4461 4462 if (HAS_LOGICAL_RING_ELSQ(i915)) { 4463 execlists->submit_reg = uncore->regs + 4464 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 4465 execlists->ctrl_reg = uncore->regs + 4466 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 4467 } else { 4468 execlists->submit_reg = uncore->regs + 4469 i915_mmio_reg_offset(RING_ELSP(base)); 4470 } 4471 4472 execlists->csb_status = 4473 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 4474 4475 execlists->csb_write = 4476 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 4477 4478 if (INTEL_GEN(i915) < 11) 4479 execlists->csb_size = GEN8_CSB_ENTRIES; 4480 else 4481 execlists->csb_size = GEN11_CSB_ENTRIES; 4482 4483 reset_csb_pointers(engine); 4484 4485 /* Finally, take ownership and responsibility for cleanup!
*/ 4486 engine->release = execlists_release; 4487 4488 return 0; 4489 } 4490 4491 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) 4492 { 4493 u32 indirect_ctx_offset; 4494 4495 switch (INTEL_GEN(engine->i915)) { 4496 default: 4497 MISSING_CASE(INTEL_GEN(engine->i915)); 4498 /* fall through */ 4499 case 12: 4500 indirect_ctx_offset = 4501 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4502 break; 4503 case 11: 4504 indirect_ctx_offset = 4505 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4506 break; 4507 case 10: 4508 indirect_ctx_offset = 4509 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4510 break; 4511 case 9: 4512 indirect_ctx_offset = 4513 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4514 break; 4515 case 8: 4516 indirect_ctx_offset = 4517 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4518 break; 4519 } 4520 4521 return indirect_ctx_offset; 4522 } 4523 4524 4525 static void init_common_reg_state(u32 * const regs, 4526 const struct intel_engine_cs *engine, 4527 const struct intel_ring *ring, 4528 bool inhibit) 4529 { 4530 u32 ctl; 4531 4532 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 4533 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 4534 if (inhibit) 4535 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 4536 if (INTEL_GEN(engine->i915) < 11) 4537 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 4538 CTX_CTRL_RS_CTX_ENABLE); 4539 regs[CTX_CONTEXT_CONTROL] = ctl; 4540 4541 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 4542 } 4543 4544 static void init_wa_bb_reg_state(u32 * const regs, 4545 const struct intel_engine_cs *engine, 4546 u32 pos_bb_per_ctx) 4547 { 4548 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 4549 4550 if (wa_ctx->per_ctx.size) { 4551 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4552 4553 regs[pos_bb_per_ctx] = 4554 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 4555 } 4556 4557 if (wa_ctx->indirect_ctx.size) { 4558 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4559 4560 regs[pos_bb_per_ctx + 2] = 4561 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 4562 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 4563 4564 regs[pos_bb_per_ctx + 4] = 4565 intel_lr_indirect_ctx_offset(engine) << 6; 4566 } 4567 } 4568 4569 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 4570 { 4571 if (i915_vm_is_4lvl(&ppgtt->vm)) { 4572 /* 64b PPGTT (48bit canonical) 4573 * PDP0_DESCRIPTOR contains the base address to PML4 and 4574 * other PDP Descriptors are ignored. 4575 */ 4576 ASSIGN_CTX_PML4(ppgtt, regs); 4577 } else { 4578 ASSIGN_CTX_PDP(ppgtt, regs, 3); 4579 ASSIGN_CTX_PDP(ppgtt, regs, 2); 4580 ASSIGN_CTX_PDP(ppgtt, regs, 1); 4581 ASSIGN_CTX_PDP(ppgtt, regs, 0); 4582 } 4583 } 4584 4585 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 4586 { 4587 if (i915_is_ggtt(vm)) 4588 return i915_vm_to_ggtt(vm)->alias; 4589 else 4590 return i915_vm_to_ppgtt(vm); 4591 } 4592 4593 static void execlists_init_reg_state(u32 *regs, 4594 const struct intel_context *ce, 4595 const struct intel_engine_cs *engine, 4596 const struct intel_ring *ring, 4597 bool inhibit) 4598 { 4599 /* 4600 * A context is actually a big batch buffer with several 4601 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. 
The 4602 * values we are setting here are only for the first context restore: 4603 * on a subsequent save, the GPU will recreate this batchbuffer with new 4604 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 4605 * we are not initializing here). 4606 * 4607 * Must keep consistent with virtual_update_register_offsets(). 4608 */ 4609 set_offsets(regs, reg_offsets(engine), engine, inhibit); 4610 4611 init_common_reg_state(regs, engine, ring, inhibit); 4612 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 4613 4614 init_wa_bb_reg_state(regs, engine, 4615 INTEL_GEN(engine->i915) >= 12 ? 4616 GEN12_CTX_BB_PER_CTX_PTR : 4617 CTX_BB_PER_CTX_PTR); 4618 4619 __reset_stop_ring(regs, engine); 4620 } 4621 4622 static int 4623 populate_lr_context(struct intel_context *ce, 4624 struct drm_i915_gem_object *ctx_obj, 4625 struct intel_engine_cs *engine, 4626 struct intel_ring *ring) 4627 { 4628 bool inhibit = true; 4629 void *vaddr; 4630 int ret; 4631 4632 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 4633 if (IS_ERR(vaddr)) { 4634 ret = PTR_ERR(vaddr); 4635 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 4636 return ret; 4637 } 4638 4639 set_redzone(vaddr, engine); 4640 4641 if (engine->default_state) { 4642 void *defaults; 4643 4644 defaults = i915_gem_object_pin_map(engine->default_state, 4645 I915_MAP_WB); 4646 if (IS_ERR(defaults)) { 4647 ret = PTR_ERR(defaults); 4648 goto err_unpin_ctx; 4649 } 4650 4651 memcpy(vaddr, defaults, engine->context_size); 4652 i915_gem_object_unpin_map(engine->default_state); 4653 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 4654 inhibit = false; 4655 } 4656 4657 /* Clear the ppHWSP (inc. per-context counters) */ 4658 memset(vaddr, 0, PAGE_SIZE); 4659 4660 /* 4661 * The second page of the context object contains some registers which 4662 * must be set up prior to the first execution. 4663 */ 4664 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE, 4665 ce, engine, ring, inhibit); 4666 4667 ret = 0; 4668 err_unpin_ctx: 4669 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 4670 i915_gem_object_unpin_map(ctx_obj); 4671 return ret; 4672 } 4673 4674 static int __execlists_context_alloc(struct intel_context *ce, 4675 struct intel_engine_cs *engine) 4676 { 4677 struct drm_i915_gem_object *ctx_obj; 4678 struct intel_ring *ring; 4679 struct i915_vma *vma; 4680 u32 context_size; 4681 int ret; 4682 4683 GEM_BUG_ON(ce->state); 4684 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 4685 4686 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4687 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 4688 4689 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 4690 if (IS_ERR(ctx_obj)) 4691 return PTR_ERR(ctx_obj); 4692 4693 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 4694 if (IS_ERR(vma)) { 4695 ret = PTR_ERR(vma); 4696 goto error_deref_obj; 4697 } 4698 4699 if (!ce->timeline) { 4700 struct intel_timeline *tl; 4701 struct i915_vma *hwsp; 4702 4703 /* 4704 * Use the static global HWSP for the kernel context, and 4705 * a dynamically allocated cacheline for everyone else. 
4706 */ 4707 hwsp = NULL; 4708 if (unlikely(intel_context_is_barrier(ce))) 4709 hwsp = engine->status_page.vma; 4710 4711 tl = intel_timeline_create(engine->gt, hwsp); 4712 if (IS_ERR(tl)) { 4713 ret = PTR_ERR(tl); 4714 goto error_deref_obj; 4715 } 4716 4717 ce->timeline = tl; 4718 } 4719 4720 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 4721 if (IS_ERR(ring)) { 4722 ret = PTR_ERR(ring); 4723 goto error_deref_obj; 4724 } 4725 4726 ret = populate_lr_context(ce, ctx_obj, engine, ring); 4727 if (ret) { 4728 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 4729 goto error_ring_free; 4730 } 4731 4732 ce->ring = ring; 4733 ce->state = vma; 4734 4735 return 0; 4736 4737 error_ring_free: 4738 intel_ring_put(ring); 4739 error_deref_obj: 4740 i915_gem_object_put(ctx_obj); 4741 return ret; 4742 } 4743 4744 static struct list_head *virtual_queue(struct virtual_engine *ve) 4745 { 4746 return &ve->base.execlists.default_priolist.requests[0]; 4747 } 4748 4749 static void virtual_context_destroy(struct kref *kref) 4750 { 4751 struct virtual_engine *ve = 4752 container_of(kref, typeof(*ve), context.ref); 4753 unsigned int n; 4754 4755 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4756 GEM_BUG_ON(ve->request); 4757 GEM_BUG_ON(ve->context.inflight); 4758 4759 for (n = 0; n < ve->num_siblings; n++) { 4760 struct intel_engine_cs *sibling = ve->siblings[n]; 4761 struct rb_node *node = &ve->nodes[sibling->id].rb; 4762 unsigned long flags; 4763 4764 if (RB_EMPTY_NODE(node)) 4765 continue; 4766 4767 spin_lock_irqsave(&sibling->active.lock, flags); 4768 4769 /* Detachment is lazily performed in the execlists tasklet */ 4770 if (!RB_EMPTY_NODE(node)) 4771 rb_erase_cached(node, &sibling->execlists.virtual); 4772 4773 spin_unlock_irqrestore(&sibling->active.lock, flags); 4774 } 4775 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 4776 4777 if (ve->context.state) 4778 __execlists_context_fini(&ve->context); 4779 intel_context_fini(&ve->context); 4780 4781 kfree(ve->bonds); 4782 kfree(ve); 4783 } 4784 4785 static void virtual_engine_initial_hint(struct virtual_engine *ve) 4786 { 4787 int swp; 4788 4789 /* 4790 * Pick a random sibling on starting to help spread the load around. 4791 * 4792 * New contexts are typically created with exactly the same order 4793 * of siblings, and often started in batches. Due to the way we iterate 4794 * the array of siblings when submitting requests, sibling[0] is 4795 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 4796 * randomised across the system, we also help spread the load by the 4797 * first engine we inspect being different each time. 4798 * 4799 * NB This does not force us to execute on this engine; it will just 4800 * typically be the first we inspect for submission.
4801 */ 4802 swp = prandom_u32_max(ve->num_siblings); 4803 if (!swp) 4804 return; 4805 4806 swap(ve->siblings[swp], ve->siblings[0]); 4807 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 4808 virtual_update_register_offsets(ve->context.lrc_reg_state, 4809 ve->siblings[0]); 4810 } 4811 4812 static int virtual_context_alloc(struct intel_context *ce) 4813 { 4814 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4815 4816 return __execlists_context_alloc(ce, ve->siblings[0]); 4817 } 4818 4819 static int virtual_context_pin(struct intel_context *ce) 4820 { 4821 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4822 int err; 4823 4824 /* Note: we must use a real engine class for setting up reg state */ 4825 err = __execlists_context_pin(ce, ve->siblings[0]); 4826 if (err) 4827 return err; 4828 4829 virtual_engine_initial_hint(ve); 4830 return 0; 4831 } 4832 4833 static void virtual_context_enter(struct intel_context *ce) 4834 { 4835 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4836 unsigned int n; 4837 4838 for (n = 0; n < ve->num_siblings; n++) 4839 intel_engine_pm_get(ve->siblings[n]); 4840 4841 intel_timeline_enter(ce->timeline); 4842 } 4843 4844 static void virtual_context_exit(struct intel_context *ce) 4845 { 4846 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4847 unsigned int n; 4848 4849 intel_timeline_exit(ce->timeline); 4850 4851 for (n = 0; n < ve->num_siblings; n++) 4852 intel_engine_pm_put(ve->siblings[n]); 4853 } 4854 4855 static const struct intel_context_ops virtual_context_ops = { 4856 .alloc = virtual_context_alloc, 4857 4858 .pin = virtual_context_pin, 4859 .unpin = execlists_context_unpin, 4860 4861 .enter = virtual_context_enter, 4862 .exit = virtual_context_exit, 4863 4864 .destroy = virtual_context_destroy, 4865 }; 4866 4867 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 4868 { 4869 struct i915_request *rq; 4870 intel_engine_mask_t mask; 4871 4872 rq = READ_ONCE(ve->request); 4873 if (!rq) 4874 return 0; 4875 4876 /* The rq is ready for submission; rq->execution_mask is now stable. 
*/ 4877 mask = rq->execution_mask; 4878 if (unlikely(!mask)) { 4879 /* Invalid selection, submit to a random engine in error */ 4880 i915_request_skip(rq, -ENODEV); 4881 mask = ve->siblings[0]->mask; 4882 } 4883 4884 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 4885 rq->fence.context, rq->fence.seqno, 4886 mask, ve->base.execlists.queue_priority_hint); 4887 4888 return mask; 4889 } 4890 4891 static void virtual_submission_tasklet(unsigned long data) 4892 { 4893 struct virtual_engine * const ve = (struct virtual_engine *)data; 4894 const int prio = ve->base.execlists.queue_priority_hint; 4895 intel_engine_mask_t mask; 4896 unsigned int n; 4897 4898 rcu_read_lock(); 4899 mask = virtual_submission_mask(ve); 4900 rcu_read_unlock(); 4901 if (unlikely(!mask)) 4902 return; 4903 4904 local_irq_disable(); 4905 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 4906 struct intel_engine_cs *sibling = ve->siblings[n]; 4907 struct ve_node * const node = &ve->nodes[sibling->id]; 4908 struct rb_node **parent, *rb; 4909 bool first; 4910 4911 if (unlikely(!(mask & sibling->mask))) { 4912 if (!RB_EMPTY_NODE(&node->rb)) { 4913 spin_lock(&sibling->active.lock); 4914 rb_erase_cached(&node->rb, 4915 &sibling->execlists.virtual); 4916 RB_CLEAR_NODE(&node->rb); 4917 spin_unlock(&sibling->active.lock); 4918 } 4919 continue; 4920 } 4921 4922 spin_lock(&sibling->active.lock); 4923 4924 if (!RB_EMPTY_NODE(&node->rb)) { 4925 /* 4926 * Cheat and avoid rebalancing the tree if we can 4927 * reuse this node in situ. 4928 */ 4929 first = rb_first_cached(&sibling->execlists.virtual) == 4930 &node->rb; 4931 if (prio == node->prio || (prio > node->prio && first)) 4932 goto submit_engine; 4933 4934 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 4935 } 4936 4937 rb = NULL; 4938 first = true; 4939 parent = &sibling->execlists.virtual.rb_root.rb_node; 4940 while (*parent) { 4941 struct ve_node *other; 4942 4943 rb = *parent; 4944 other = rb_entry(rb, typeof(*other), rb); 4945 if (prio > other->prio) { 4946 parent = &rb->rb_left; 4947 } else { 4948 parent = &rb->rb_right; 4949 first = false; 4950 } 4951 } 4952 4953 rb_link_node(&node->rb, rb, parent); 4954 rb_insert_color_cached(&node->rb, 4955 &sibling->execlists.virtual, 4956 first); 4957 4958 submit_engine: 4959 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 4960 node->prio = prio; 4961 if (first && prio > sibling->execlists.queue_priority_hint) { 4962 sibling->execlists.queue_priority_hint = prio; 4963 tasklet_hi_schedule(&sibling->execlists.tasklet); 4964 } 4965 4966 spin_unlock(&sibling->active.lock); 4967 } 4968 local_irq_enable(); 4969 } 4970 4971 static void virtual_submit_request(struct i915_request *rq) 4972 { 4973 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4974 struct i915_request *old; 4975 unsigned long flags; 4976 4977 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 4978 rq->fence.context, 4979 rq->fence.seqno); 4980 4981 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 4982 4983 spin_lock_irqsave(&ve->base.active.lock, flags); 4984 4985 old = ve->request; 4986 if (old) { /* background completion event from preempt-to-busy */ 4987 GEM_BUG_ON(!i915_request_completed(old)); 4988 __i915_request_submit(old); 4989 i915_request_put(old); 4990 } 4991 4992 if (i915_request_completed(rq)) { 4993 __i915_request_submit(rq); 4994 4995 ve->base.execlists.queue_priority_hint = INT_MIN; 4996 ve->request = NULL; 4997 } else { 4998 ve->base.execlists.queue_priority_hint = rq_prio(rq); 4999 ve->request = i915_request_get(rq); 
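/* The reference taken here is dropped by whoever clears ve->request: the physical sibling that claims the request at dequeue time, a later submission that finds it already completed (see the handling of 'old' above), or execlists_reset_cancel() on wedging. */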
5000 5001 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5002 list_move_tail(&rq->sched.link, virtual_queue(ve)); 5003 5004 tasklet_schedule(&ve->base.execlists.tasklet); 5005 } 5006 5007 spin_unlock_irqrestore(&ve->base.active.lock, flags); 5008 } 5009 5010 static struct ve_bond * 5011 virtual_find_bond(struct virtual_engine *ve, 5012 const struct intel_engine_cs *master) 5013 { 5014 int i; 5015 5016 for (i = 0; i < ve->num_bonds; i++) { 5017 if (ve->bonds[i].master == master) 5018 return &ve->bonds[i]; 5019 } 5020 5021 return NULL; 5022 } 5023 5024 static void 5025 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 5026 { 5027 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5028 intel_engine_mask_t allowed, exec; 5029 struct ve_bond *bond; 5030 5031 allowed = ~to_request(signal)->engine->mask; 5032 5033 bond = virtual_find_bond(ve, to_request(signal)->engine); 5034 if (bond) 5035 allowed &= bond->sibling_mask; 5036 5037 /* Restrict the bonded request to run on only the available engines */ 5038 exec = READ_ONCE(rq->execution_mask); 5039 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 5040 ; 5041 5042 /* Prevent the master from being re-run on the bonded engines */ 5043 to_request(signal)->execution_mask &= ~allowed; 5044 } 5045 5046 struct intel_context * 5047 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 5048 unsigned int count) 5049 { 5050 struct virtual_engine *ve; 5051 unsigned int n; 5052 int err; 5053 5054 if (count == 0) 5055 return ERR_PTR(-EINVAL); 5056 5057 if (count == 1) 5058 return intel_context_create(siblings[0]); 5059 5060 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 5061 if (!ve) 5062 return ERR_PTR(-ENOMEM); 5063 5064 ve->base.i915 = siblings[0]->i915; 5065 ve->base.gt = siblings[0]->gt; 5066 ve->base.uncore = siblings[0]->uncore; 5067 ve->base.id = -1; 5068 5069 ve->base.class = OTHER_CLASS; 5070 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 5071 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5072 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5073 5074 /* 5075 * The decision on whether to submit a request using semaphores 5076 * depends on the saturated state of the engine. We only compute 5077 * this during HW submission of the request, and we need for this 5078 * state to be globally applied to all requests being submitted 5079 * to this engine. Virtual engines encompass more than one physical 5080 * engine and so we cannot accurately tell in advance if one of those 5081 * engines is already saturated and so cannot afford to use a semaphore 5082 * and be pessimized in priority for doing so -- if we are the only 5083 * context using semaphores after all other clients have stopped, we 5084 * will be starved on the saturated system. Such a global switch for 5085 * semaphores is less than ideal, but alas is the current compromise. 
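* The compromise taken here is to mark the virtual engine as permanently saturated (ve->base.saturated = ALL_ENGINES below), so that requests submitted to it never opt into semaphore waits.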
5086 */ 5087 ve->base.saturated = ALL_ENGINES; 5088 5089 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 5090 5091 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 5092 intel_engine_init_breadcrumbs(&ve->base); 5093 intel_engine_init_execlists(&ve->base); 5094 5095 ve->base.cops = &virtual_context_ops; 5096 ve->base.request_alloc = execlists_request_alloc; 5097 5098 ve->base.schedule = i915_schedule; 5099 ve->base.submit_request = virtual_submit_request; 5100 ve->base.bond_execute = virtual_bond_execute; 5101 5102 INIT_LIST_HEAD(virtual_queue(ve)); 5103 ve->base.execlists.queue_priority_hint = INT_MIN; 5104 tasklet_init(&ve->base.execlists.tasklet, 5105 virtual_submission_tasklet, 5106 (unsigned long)ve); 5107 5108 intel_context_init(&ve->context, &ve->base); 5109 5110 for (n = 0; n < count; n++) { 5111 struct intel_engine_cs *sibling = siblings[n]; 5112 5113 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 5114 if (sibling->mask & ve->base.mask) { 5115 DRM_DEBUG("duplicate %s entry in load balancer\n", 5116 sibling->name); 5117 err = -EINVAL; 5118 goto err_put; 5119 } 5120 5121 /* 5122 * The virtual engine implementation is tightly coupled to 5123 * the execlists backend -- we push out requests directly 5124 * into a tree inside each physical engine. We could support 5125 * layering if we handle cloning of the requests and 5126 * submitting a copy into each backend. 5127 */ 5128 if (sibling->execlists.tasklet.func != 5129 execlists_submission_tasklet) { 5130 err = -ENODEV; 5131 goto err_put; 5132 } 5133 5134 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 5135 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 5136 5137 ve->siblings[ve->num_siblings++] = sibling; 5138 ve->base.mask |= sibling->mask; 5139 5140 /* 5141 * All physical engines must be compatible for their emission 5142 * functions (as we build the instructions during request 5143 * construction and do not alter them before submission 5144 * on the physical engine). We use the engine class as a guide 5145 * here, although that could be refined.
5146 */ 5147 if (ve->base.class != OTHER_CLASS) { 5148 if (ve->base.class != sibling->class) { 5149 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 5150 sibling->class, ve->base.class); 5151 err = -EINVAL; 5152 goto err_put; 5153 } 5154 continue; 5155 } 5156 5157 ve->base.class = sibling->class; 5158 ve->base.uabi_class = sibling->uabi_class; 5159 snprintf(ve->base.name, sizeof(ve->base.name), 5160 "v%dx%d", ve->base.class, count); 5161 ve->base.context_size = sibling->context_size; 5162 5163 ve->base.emit_bb_start = sibling->emit_bb_start; 5164 ve->base.emit_flush = sibling->emit_flush; 5165 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 5166 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 5167 ve->base.emit_fini_breadcrumb_dw = 5168 sibling->emit_fini_breadcrumb_dw; 5169 5170 ve->base.flags = sibling->flags; 5171 } 5172 5173 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 5174 5175 return &ve->context; 5176 5177 err_put: 5178 intel_context_put(&ve->context); 5179 return ERR_PTR(err); 5180 } 5181 5182 struct intel_context * 5183 intel_execlists_clone_virtual(struct intel_engine_cs *src) 5184 { 5185 struct virtual_engine *se = to_virtual_engine(src); 5186 struct intel_context *dst; 5187 5188 dst = intel_execlists_create_virtual(se->siblings, 5189 se->num_siblings); 5190 if (IS_ERR(dst)) 5191 return dst; 5192 5193 if (se->num_bonds) { 5194 struct virtual_engine *de = to_virtual_engine(dst->engine); 5195 5196 de->bonds = kmemdup(se->bonds, 5197 sizeof(*se->bonds) * se->num_bonds, 5198 GFP_KERNEL); 5199 if (!de->bonds) { 5200 intel_context_put(dst); 5201 return ERR_PTR(-ENOMEM); 5202 } 5203 5204 de->num_bonds = se->num_bonds; 5205 } 5206 5207 return dst; 5208 } 5209 5210 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 5211 const struct intel_engine_cs *master, 5212 const struct intel_engine_cs *sibling) 5213 { 5214 struct virtual_engine *ve = to_virtual_engine(engine); 5215 struct ve_bond *bond; 5216 int n; 5217 5218 /* Sanity check the sibling is part of the virtual engine */ 5219 for (n = 0; n < ve->num_siblings; n++) 5220 if (sibling == ve->siblings[n]) 5221 break; 5222 if (n == ve->num_siblings) 5223 return -EINVAL; 5224 5225 bond = virtual_find_bond(ve, master); 5226 if (bond) { 5227 bond->sibling_mask |= sibling->mask; 5228 return 0; 5229 } 5230 5231 bond = krealloc(ve->bonds, 5232 sizeof(*bond) * (ve->num_bonds + 1), 5233 GFP_KERNEL); 5234 if (!bond) 5235 return -ENOMEM; 5236 5237 bond[ve->num_bonds].master = master; 5238 bond[ve->num_bonds].sibling_mask = sibling->mask; 5239 5240 ve->bonds = bond; 5241 ve->num_bonds++; 5242 5243 return 0; 5244 } 5245 5246 struct intel_engine_cs * 5247 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 5248 unsigned int sibling) 5249 { 5250 struct virtual_engine *ve = to_virtual_engine(engine); 5251 5252 if (sibling >= ve->num_siblings) 5253 return NULL; 5254 5255 return ve->siblings[sibling]; 5256 } 5257 5258 void intel_execlists_show_requests(struct intel_engine_cs *engine, 5259 struct drm_printer *m, 5260 void (*show_request)(struct drm_printer *m, 5261 struct i915_request *rq, 5262 const char *prefix), 5263 unsigned int max) 5264 { 5265 const struct intel_engine_execlists *execlists = &engine->execlists; 5266 struct i915_request *rq, *last; 5267 unsigned long flags; 5268 unsigned int count; 5269 struct rb_node *rb; 5270 5271 spin_lock_irqsave(&engine->active.lock, flags); 5272 5273 last = NULL; 5274 count = 0; 5275 list_for_each_entry(rq, &engine->active.requests, 
sched.link) { 5276 if (count++ < max - 1) 5277 show_request(m, rq, "\t\tE "); 5278 else 5279 last = rq; 5280 } 5281 if (last) { 5282 if (count > max) { 5283 drm_printf(m, 5284 "\t\t...skipping %d executing requests...\n", 5285 count - max); 5286 } 5287 show_request(m, last, "\t\tE "); 5288 } 5289 5290 last = NULL; 5291 count = 0; 5292 if (execlists->queue_priority_hint != INT_MIN) 5293 drm_printf(m, "\t\tQueue priority hint: %d\n", 5294 execlists->queue_priority_hint); 5295 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 5296 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 5297 int i; 5298 5299 priolist_for_each_request(rq, p, i) { 5300 if (count++ < max - 1) 5301 show_request(m, rq, "\t\tQ "); 5302 else 5303 last = rq; 5304 } 5305 } 5306 if (last) { 5307 if (count > max) { 5308 drm_printf(m, 5309 "\t\t...skipping %d queued requests...\n", 5310 count - max); 5311 } 5312 show_request(m, last, "\t\tQ "); 5313 } 5314 5315 last = NULL; 5316 count = 0; 5317 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 5318 struct virtual_engine *ve = 5319 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 5320 struct i915_request *rq = READ_ONCE(ve->request); 5321 5322 if (rq) { 5323 if (count++ < max - 1) 5324 show_request(m, rq, "\t\tV "); 5325 else 5326 last = rq; 5327 } 5328 } 5329 if (last) { 5330 if (count > max) { 5331 drm_printf(m, 5332 "\t\t...skipping %d virtual requests...\n", 5333 count - max); 5334 } 5335 show_request(m, last, "\t\tV "); 5336 } 5337 5338 spin_unlock_irqrestore(&engine->active.lock, flags); 5339 } 5340 5341 void intel_lr_context_reset(struct intel_engine_cs *engine, 5342 struct intel_context *ce, 5343 u32 head, 5344 bool scrub) 5345 { 5346 GEM_BUG_ON(!intel_context_is_pinned(ce)); 5347 5348 /* 5349 * We want a simple context + ring to execute the breadcrumb update. 5350 * We cannot rely on the context being intact across the GPU hang, 5351 * so clear it and rebuild just what we need for the breadcrumb. 5352 * All pending requests for this context will be zapped, and any 5353 * future request will be after userspace has had the opportunity 5354 * to recreate its own state. 5355 */ 5356 if (scrub) 5357 restore_default_state(ce, engine); 5358 5359 /* Rerun the request; its payload has been neutered (if guilty). */ 5360 __execlists_update_reg_state(ce, engine, head); 5361 } 5362 5363 bool 5364 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 5365 { 5366 return engine->set_default_submission == 5367 intel_execlists_set_default_submission; 5368 } 5369 5370 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 5371 #include "selftest_lrc.c" 5372 #endif 5373