/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use it. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead is kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different
 * context, which will cause the hardware to continue executing the second
 * request and queue the new request (the GPU detects the condition of a context
 * getting preempted with the same context and optimizes the context switch flow
 * by not doing preemption, but just sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed onto a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
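	 *
	 * As a purely illustrative sketch (the helper name and the fallback
	 * behaviour are assumptions of this note, not the driver's API),
	 * resolving that restriction amounts to a linear scan of the bonds
	 * array declared below:
	 *
	 *	static intel_engine_mask_t
	 *	bond_sibling_mask(const struct virtual_engine *ve,
	 *			  const struct intel_engine_cs *master)
	 *	{
	 *		unsigned int i;
	 *
	 *		for (i = 0; i < ve->num_bonds; i++)
	 *			if (ve->bonds[i].master == master)
	 *				return ve->bonds[i].sibling_mask;
	 *
	 *		// assumption: no bond recorded, no restriction
	 *		return ALL_ENGINES;
	 *	}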
212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[0]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine, 241 u32 head); 242 243 static void mark_eio(struct i915_request *rq) 244 { 245 if (i915_request_completed(rq)) 246 return; 247 248 GEM_BUG_ON(i915_request_signaled(rq)); 249 250 dma_fence_set_error(&rq->fence, -EIO); 251 i915_request_mark_complete(rq); 252 } 253 254 static struct i915_request * 255 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 256 { 257 struct i915_request *active = rq; 258 259 rcu_read_lock(); 260 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 261 if (i915_request_completed(rq)) 262 break; 263 264 active = rq; 265 } 266 rcu_read_unlock(); 267 268 return active; 269 } 270 271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 272 { 273 return (i915_ggtt_offset(engine->status_page.vma) + 274 I915_GEM_HWS_PREEMPT_ADDR); 275 } 276 277 static inline void 278 ring_set_paused(const struct intel_engine_cs *engine, int state) 279 { 280 /* 281 * We inspect HWS_PREEMPT with a semaphore inside 282 * engine->emit_fini_breadcrumb. If the dword is true, 283 * the ring is paused as the semaphore will busywait 284 * until the dword is false. 285 */ 286 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 287 if (state) 288 wmb(); 289 } 290 291 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 292 { 293 return rb_entry(rb, struct i915_priolist, node); 294 } 295 296 static inline int rq_prio(const struct i915_request *rq) 297 { 298 return rq->sched.attr.priority; 299 } 300 301 static int effective_prio(const struct i915_request *rq) 302 { 303 int prio = rq_prio(rq); 304 305 /* 306 * If this request is special and must not be interrupted at any 307 * cost, so be it. Note we are only checking the most recent request 308 * in the context and so may be masking an earlier vip request. It 309 * is hoped that under the conditions where nopreempt is used, this 310 * will not matter (i.e. all requests to that context will be 311 * nopreempt for as long as desired). 312 */ 313 if (i915_request_has_nopreempt(rq)) 314 prio = I915_PRIORITY_UNPREEMPTABLE; 315 316 /* 317 * On unwinding the active request, we give it a priority bump 318 * if it has completed waiting on any semaphore. If we know that 319 * the request has already started, we can prevent an unwanted 320 * preempt-to-idle cycle by taking that into account now. 
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1]; it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
425 * 426 * Even with preemption, there are times when we think it is better not 427 * to preempt and leave an ostensibly lower priority request in flight. 428 */ 429 if (i915_request_is_active(prev)) 430 return true; 431 432 return rq_prio(prev) >= rq_prio(next); 433 } 434 435 /* 436 * The context descriptor encodes various attributes of a context, 437 * including its GTT address and some flags. Because it's fairly 438 * expensive to calculate, we'll just do it once and cache the result, 439 * which remains valid until the context is unpinned. 440 * 441 * This is what a descriptor looks like, from LSB to MSB:: 442 * 443 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 444 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 445 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 446 * bits 53-54: mbz, reserved for use by hardware 447 * bits 55-63: group ID, currently unused and set to 0 448 * 449 * Starting from Gen11, the upper dword of the descriptor has a new format: 450 * 451 * bits 32-36: reserved 452 * bits 37-47: SW context ID 453 * bits 48:53: engine instance 454 * bit 54: mbz, reserved for use by hardware 455 * bits 55-60: SW counter 456 * bits 61-63: engine class 457 * 458 * engine info, SW context ID and SW counter need to form a unique number 459 * (Context ID) per lrc. 460 */ 461 static u64 462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 463 { 464 u64 desc; 465 466 desc = INTEL_LEGACY_32B_CONTEXT; 467 if (i915_vm_is_4lvl(ce->vm)) 468 desc = INTEL_LEGACY_64B_CONTEXT; 469 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 470 471 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 472 if (IS_GEN(engine->i915, 8)) 473 desc |= GEN8_CTX_L3LLC_COHERENT; 474 475 desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */ 476 /* 477 * The following 32bits are copied into the OA reports (dword 2). 478 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing 479 * anything below. 
480 */ 481 if (INTEL_GEN(engine->i915) >= 11) { 482 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 483 /* bits 48-53 */ 484 485 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 486 /* bits 61-63 */ 487 } 488 489 return desc; 490 } 491 492 static inline unsigned int dword_in_page(void *addr) 493 { 494 return offset_in_page(addr) / sizeof(u32); 495 } 496 497 static void set_offsets(u32 *regs, 498 const u8 *data, 499 const struct intel_engine_cs *engine, 500 bool clear) 501 #define NOP(x) (BIT(7) | (x)) 502 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 503 #define POSTED BIT(0) 504 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 505 #define REG16(x) \ 506 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 507 (((x) >> 2) & 0x7f) 508 #define END(x) 0, (x) 509 { 510 const u32 base = engine->mmio_base; 511 512 while (*data) { 513 u8 count, flags; 514 515 if (*data & BIT(7)) { /* skip */ 516 count = *data++ & ~BIT(7); 517 if (clear) 518 memset32(regs, MI_NOOP, count); 519 regs += count; 520 continue; 521 } 522 523 count = *data & 0x3f; 524 flags = *data >> 6; 525 data++; 526 527 *regs = MI_LOAD_REGISTER_IMM(count); 528 if (flags & POSTED) 529 *regs |= MI_LRI_FORCE_POSTED; 530 if (INTEL_GEN(engine->i915) >= 11) 531 *regs |= MI_LRI_CS_MMIO; 532 regs++; 533 534 GEM_BUG_ON(!count); 535 do { 536 u32 offset = 0; 537 u8 v; 538 539 do { 540 v = *data++; 541 offset <<= 7; 542 offset |= v & ~BIT(7); 543 } while (v & BIT(7)); 544 545 regs[0] = base + (offset << 2); 546 if (clear) 547 regs[1] = 0; 548 regs += 2; 549 } while (--count); 550 } 551 552 if (clear) { 553 u8 count = *++data; 554 555 /* Clear past the tail for HW access */ 556 GEM_BUG_ON(dword_in_page(regs) > count); 557 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 558 559 /* Close the batch; used mainly by live_lrc_layout() */ 560 *regs = MI_BATCH_BUFFER_END; 561 if (INTEL_GEN(engine->i915) >= 10) 562 *regs |= BIT(0); 563 } 564 } 565 566 static const u8 gen8_xcs_offsets[] = { 567 NOP(1), 568 LRI(11, 0), 569 REG16(0x244), 570 REG(0x034), 571 REG(0x030), 572 REG(0x038), 573 REG(0x03c), 574 REG(0x168), 575 REG(0x140), 576 REG(0x110), 577 REG(0x11c), 578 REG(0x114), 579 REG(0x118), 580 581 NOP(9), 582 LRI(9, 0), 583 REG16(0x3a8), 584 REG16(0x28c), 585 REG16(0x288), 586 REG16(0x284), 587 REG16(0x280), 588 REG16(0x27c), 589 REG16(0x278), 590 REG16(0x274), 591 REG16(0x270), 592 593 NOP(13), 594 LRI(2, 0), 595 REG16(0x200), 596 REG(0x028), 597 598 END(80) 599 }; 600 601 static const u8 gen9_xcs_offsets[] = { 602 NOP(1), 603 LRI(14, POSTED), 604 REG16(0x244), 605 REG(0x034), 606 REG(0x030), 607 REG(0x038), 608 REG(0x03c), 609 REG(0x168), 610 REG(0x140), 611 REG(0x110), 612 REG(0x11c), 613 REG(0x114), 614 REG(0x118), 615 REG(0x1c0), 616 REG(0x1c4), 617 REG(0x1c8), 618 619 NOP(3), 620 LRI(9, POSTED), 621 REG16(0x3a8), 622 REG16(0x28c), 623 REG16(0x288), 624 REG16(0x284), 625 REG16(0x280), 626 REG16(0x27c), 627 REG16(0x278), 628 REG16(0x274), 629 REG16(0x270), 630 631 NOP(13), 632 LRI(1, POSTED), 633 REG16(0x200), 634 635 NOP(13), 636 LRI(44, POSTED), 637 REG(0x028), 638 REG(0x09c), 639 REG(0x0c0), 640 REG(0x178), 641 REG(0x17c), 642 REG16(0x358), 643 REG(0x170), 644 REG(0x150), 645 REG(0x154), 646 REG(0x158), 647 REG16(0x41c), 648 REG16(0x600), 649 REG16(0x604), 650 REG16(0x608), 651 REG16(0x60c), 652 REG16(0x610), 653 REG16(0x614), 654 REG16(0x618), 655 REG16(0x61c), 656 REG16(0x620), 657 REG16(0x624), 658 REG16(0x628), 659 REG16(0x62c), 660 REG16(0x630), 661 
REG16(0x634), 662 REG16(0x638), 663 REG16(0x63c), 664 REG16(0x640), 665 REG16(0x644), 666 REG16(0x648), 667 REG16(0x64c), 668 REG16(0x650), 669 REG16(0x654), 670 REG16(0x658), 671 REG16(0x65c), 672 REG16(0x660), 673 REG16(0x664), 674 REG16(0x668), 675 REG16(0x66c), 676 REG16(0x670), 677 REG16(0x674), 678 REG16(0x678), 679 REG16(0x67c), 680 REG(0x068), 681 682 END(176) 683 }; 684 685 static const u8 gen12_xcs_offsets[] = { 686 NOP(1), 687 LRI(13, POSTED), 688 REG16(0x244), 689 REG(0x034), 690 REG(0x030), 691 REG(0x038), 692 REG(0x03c), 693 REG(0x168), 694 REG(0x140), 695 REG(0x110), 696 REG(0x1c0), 697 REG(0x1c4), 698 REG(0x1c8), 699 REG(0x180), 700 REG16(0x2b4), 701 702 NOP(5), 703 LRI(9, POSTED), 704 REG16(0x3a8), 705 REG16(0x28c), 706 REG16(0x288), 707 REG16(0x284), 708 REG16(0x280), 709 REG16(0x27c), 710 REG16(0x278), 711 REG16(0x274), 712 REG16(0x270), 713 714 END(80) 715 }; 716 717 static const u8 gen8_rcs_offsets[] = { 718 NOP(1), 719 LRI(14, POSTED), 720 REG16(0x244), 721 REG(0x034), 722 REG(0x030), 723 REG(0x038), 724 REG(0x03c), 725 REG(0x168), 726 REG(0x140), 727 REG(0x110), 728 REG(0x11c), 729 REG(0x114), 730 REG(0x118), 731 REG(0x1c0), 732 REG(0x1c4), 733 REG(0x1c8), 734 735 NOP(3), 736 LRI(9, POSTED), 737 REG16(0x3a8), 738 REG16(0x28c), 739 REG16(0x288), 740 REG16(0x284), 741 REG16(0x280), 742 REG16(0x27c), 743 REG16(0x278), 744 REG16(0x274), 745 REG16(0x270), 746 747 NOP(13), 748 LRI(1, 0), 749 REG(0x0c8), 750 751 END(80) 752 }; 753 754 static const u8 gen9_rcs_offsets[] = { 755 NOP(1), 756 LRI(14, POSTED), 757 REG16(0x244), 758 REG(0x34), 759 REG(0x30), 760 REG(0x38), 761 REG(0x3c), 762 REG(0x168), 763 REG(0x140), 764 REG(0x110), 765 REG(0x11c), 766 REG(0x114), 767 REG(0x118), 768 REG(0x1c0), 769 REG(0x1c4), 770 REG(0x1c8), 771 772 NOP(3), 773 LRI(9, POSTED), 774 REG16(0x3a8), 775 REG16(0x28c), 776 REG16(0x288), 777 REG16(0x284), 778 REG16(0x280), 779 REG16(0x27c), 780 REG16(0x278), 781 REG16(0x274), 782 REG16(0x270), 783 784 NOP(13), 785 LRI(1, 0), 786 REG(0xc8), 787 788 NOP(13), 789 LRI(44, POSTED), 790 REG(0x28), 791 REG(0x9c), 792 REG(0xc0), 793 REG(0x178), 794 REG(0x17c), 795 REG16(0x358), 796 REG(0x170), 797 REG(0x150), 798 REG(0x154), 799 REG(0x158), 800 REG16(0x41c), 801 REG16(0x600), 802 REG16(0x604), 803 REG16(0x608), 804 REG16(0x60c), 805 REG16(0x610), 806 REG16(0x614), 807 REG16(0x618), 808 REG16(0x61c), 809 REG16(0x620), 810 REG16(0x624), 811 REG16(0x628), 812 REG16(0x62c), 813 REG16(0x630), 814 REG16(0x634), 815 REG16(0x638), 816 REG16(0x63c), 817 REG16(0x640), 818 REG16(0x644), 819 REG16(0x648), 820 REG16(0x64c), 821 REG16(0x650), 822 REG16(0x654), 823 REG16(0x658), 824 REG16(0x65c), 825 REG16(0x660), 826 REG16(0x664), 827 REG16(0x668), 828 REG16(0x66c), 829 REG16(0x670), 830 REG16(0x674), 831 REG16(0x678), 832 REG16(0x67c), 833 REG(0x68), 834 835 END(176) 836 }; 837 838 static const u8 gen11_rcs_offsets[] = { 839 NOP(1), 840 LRI(15, POSTED), 841 REG16(0x244), 842 REG(0x034), 843 REG(0x030), 844 REG(0x038), 845 REG(0x03c), 846 REG(0x168), 847 REG(0x140), 848 REG(0x110), 849 REG(0x11c), 850 REG(0x114), 851 REG(0x118), 852 REG(0x1c0), 853 REG(0x1c4), 854 REG(0x1c8), 855 REG(0x180), 856 857 NOP(1), 858 LRI(9, POSTED), 859 REG16(0x3a8), 860 REG16(0x28c), 861 REG16(0x288), 862 REG16(0x284), 863 REG16(0x280), 864 REG16(0x27c), 865 REG16(0x278), 866 REG16(0x274), 867 REG16(0x270), 868 869 LRI(1, POSTED), 870 REG(0x1b0), 871 872 NOP(10), 873 LRI(1, 0), 874 REG(0x0c8), 875 876 END(80) 877 }; 878 879 static const u8 gen12_rcs_offsets[] = { 880 NOP(1), 881 LRI(13, 
POSTED), 882 REG16(0x244), 883 REG(0x034), 884 REG(0x030), 885 REG(0x038), 886 REG(0x03c), 887 REG(0x168), 888 REG(0x140), 889 REG(0x110), 890 REG(0x1c0), 891 REG(0x1c4), 892 REG(0x1c8), 893 REG(0x180), 894 REG16(0x2b4), 895 896 NOP(5), 897 LRI(9, POSTED), 898 REG16(0x3a8), 899 REG16(0x28c), 900 REG16(0x288), 901 REG16(0x284), 902 REG16(0x280), 903 REG16(0x27c), 904 REG16(0x278), 905 REG16(0x274), 906 REG16(0x270), 907 908 LRI(3, POSTED), 909 REG(0x1b0), 910 REG16(0x5a8), 911 REG16(0x5ac), 912 913 NOP(6), 914 LRI(1, 0), 915 REG(0x0c8), 916 917 END(80) 918 }; 919 920 #undef END 921 #undef REG16 922 #undef REG 923 #undef LRI 924 #undef NOP 925 926 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 927 { 928 /* 929 * The gen12+ lists only have the registers we program in the basic 930 * default state. We rely on the context image using relative 931 * addressing to automatic fixup the register state between the 932 * physical engines for virtual engine. 933 */ 934 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 935 !intel_engine_has_relative_mmio(engine)); 936 937 if (engine->class == RENDER_CLASS) { 938 if (INTEL_GEN(engine->i915) >= 12) 939 return gen12_rcs_offsets; 940 else if (INTEL_GEN(engine->i915) >= 11) 941 return gen11_rcs_offsets; 942 else if (INTEL_GEN(engine->i915) >= 9) 943 return gen9_rcs_offsets; 944 else 945 return gen8_rcs_offsets; 946 } else { 947 if (INTEL_GEN(engine->i915) >= 12) 948 return gen12_xcs_offsets; 949 else if (INTEL_GEN(engine->i915) >= 9) 950 return gen9_xcs_offsets; 951 else 952 return gen8_xcs_offsets; 953 } 954 } 955 956 static struct i915_request * 957 __unwind_incomplete_requests(struct intel_engine_cs *engine) 958 { 959 struct i915_request *rq, *rn, *active = NULL; 960 struct list_head *uninitialized_var(pl); 961 int prio = I915_PRIORITY_INVALID; 962 963 lockdep_assert_held(&engine->active.lock); 964 965 list_for_each_entry_safe_reverse(rq, rn, 966 &engine->active.requests, 967 sched.link) { 968 if (i915_request_completed(rq)) 969 continue; /* XXX */ 970 971 __i915_request_unsubmit(rq); 972 973 /* 974 * Push the request back into the queue for later resubmission. 975 * If this request is not native to this physical engine (i.e. 976 * it came from a virtual source), push it back onto the virtual 977 * engine so that it can be moved across onto another physical 978 * engine as load dictates. 979 */ 980 if (likely(rq->execution_mask == engine->mask)) { 981 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 982 if (rq_prio(rq) != prio) { 983 prio = rq_prio(rq); 984 pl = i915_sched_lookup_priolist(engine, prio); 985 } 986 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 987 988 list_move(&rq->sched.link, pl); 989 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 990 991 active = rq; 992 } else { 993 struct intel_engine_cs *owner = rq->context->engine; 994 995 /* 996 * Decouple the virtual breadcrumb before moving it 997 * back to the virtual engine -- we don't want the 998 * request to complete in the background and try 999 * and cancel the breadcrumb on the virtual engine 1000 * (instead of the old engine where it is linked)! 
1001 */ 1002 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 1003 &rq->fence.flags)) { 1004 spin_lock_nested(&rq->lock, 1005 SINGLE_DEPTH_NESTING); 1006 i915_request_cancel_breadcrumb(rq); 1007 spin_unlock(&rq->lock); 1008 } 1009 rq->engine = owner; 1010 owner->submit_request(rq); 1011 active = NULL; 1012 } 1013 } 1014 1015 return active; 1016 } 1017 1018 struct i915_request * 1019 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1020 { 1021 struct intel_engine_cs *engine = 1022 container_of(execlists, typeof(*engine), execlists); 1023 1024 return __unwind_incomplete_requests(engine); 1025 } 1026 1027 static inline void 1028 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1029 { 1030 /* 1031 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1032 * The compiler should eliminate this function as dead-code. 1033 */ 1034 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1035 return; 1036 1037 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1038 status, rq); 1039 } 1040 1041 static void intel_engine_context_in(struct intel_engine_cs *engine) 1042 { 1043 unsigned long flags; 1044 1045 if (READ_ONCE(engine->stats.enabled) == 0) 1046 return; 1047 1048 write_seqlock_irqsave(&engine->stats.lock, flags); 1049 1050 if (engine->stats.enabled > 0) { 1051 if (engine->stats.active++ == 0) 1052 engine->stats.start = ktime_get(); 1053 GEM_BUG_ON(engine->stats.active == 0); 1054 } 1055 1056 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1057 } 1058 1059 static void intel_engine_context_out(struct intel_engine_cs *engine) 1060 { 1061 unsigned long flags; 1062 1063 if (READ_ONCE(engine->stats.enabled) == 0) 1064 return; 1065 1066 write_seqlock_irqsave(&engine->stats.lock, flags); 1067 1068 if (engine->stats.enabled > 0) { 1069 ktime_t last; 1070 1071 if (engine->stats.active && --engine->stats.active == 0) { 1072 /* 1073 * Decrement the active context count and in case GPU 1074 * is now idle add up to the running total. 1075 */ 1076 last = ktime_sub(ktime_get(), engine->stats.start); 1077 1078 engine->stats.total = ktime_add(engine->stats.total, 1079 last); 1080 } else if (engine->stats.active == 0) { 1081 /* 1082 * After turning on engine stats, context out might be 1083 * the first event in which case we account from the 1084 * time stats gathering was turned on. 
1085 */ 1086 last = ktime_sub(ktime_get(), engine->stats.enabled_at); 1087 1088 engine->stats.total = ktime_add(engine->stats.total, 1089 last); 1090 } 1091 } 1092 1093 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1094 } 1095 1096 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 1097 { 1098 if (INTEL_GEN(engine->i915) >= 12) 1099 return 0x60; 1100 else if (INTEL_GEN(engine->i915) >= 9) 1101 return 0x54; 1102 else if (engine->class == RENDER_CLASS) 1103 return 0x58; 1104 else 1105 return -1; 1106 } 1107 1108 static void 1109 execlists_check_context(const struct intel_context *ce, 1110 const struct intel_engine_cs *engine) 1111 { 1112 const struct intel_ring *ring = ce->ring; 1113 u32 *regs = ce->lrc_reg_state; 1114 bool valid = true; 1115 int x; 1116 1117 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1118 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1119 engine->name, 1120 regs[CTX_RING_START], 1121 i915_ggtt_offset(ring->vma)); 1122 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1123 valid = false; 1124 } 1125 1126 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1127 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1128 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1129 engine->name, 1130 regs[CTX_RING_CTL], 1131 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1132 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1133 valid = false; 1134 } 1135 1136 x = lrc_ring_mi_mode(engine); 1137 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1138 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1139 engine->name, regs[x + 1]); 1140 regs[x + 1] &= ~STOP_RING; 1141 regs[x + 1] |= STOP_RING << 16; 1142 valid = false; 1143 } 1144 1145 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1146 } 1147 1148 static void restore_default_state(struct intel_context *ce, 1149 struct intel_engine_cs *engine) 1150 { 1151 u32 *regs = ce->lrc_reg_state; 1152 1153 if (engine->pinned_default_state) 1154 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 1155 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 1156 engine->context_size - PAGE_SIZE); 1157 1158 execlists_init_reg_state(regs, ce, engine, ce->ring, false); 1159 } 1160 1161 static void reset_active(struct i915_request *rq, 1162 struct intel_engine_cs *engine) 1163 { 1164 struct intel_context * const ce = rq->context; 1165 u32 head; 1166 1167 /* 1168 * The executing context has been cancelled. We want to prevent 1169 * further execution along this context and propagate the error on 1170 * to anything depending on its results. 1171 * 1172 * In __i915_request_submit(), we apply the -EIO and remove the 1173 * requests' payloads for any banned requests. But first, we must 1174 * rewind the context back to the start of the incomplete request so 1175 * that we do not jump back into the middle of the batch. 1176 * 1177 * We preserve the breadcrumbs and semaphores of the incomplete 1178 * requests so that inter-timeline dependencies (i.e other timelines) 1179 * remain correctly ordered. And we defer to __i915_request_submit() 1180 * so that all asynchronous waits are correctly handled. 
1181 */ 1182 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1183 rq->fence.context, rq->fence.seqno); 1184 1185 /* On resubmission of the active request, payload will be scrubbed */ 1186 if (i915_request_completed(rq)) 1187 head = rq->tail; 1188 else 1189 head = active_request(ce->timeline, rq)->head; 1190 head = intel_ring_wrap(ce->ring, head); 1191 1192 /* Scrub the context image to prevent replaying the previous batch */ 1193 restore_default_state(ce, engine); 1194 __execlists_update_reg_state(ce, engine, head); 1195 1196 /* We've switched away, so this should be a no-op, but intent matters */ 1197 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1198 } 1199 1200 static inline struct intel_engine_cs * 1201 __execlists_schedule_in(struct i915_request *rq) 1202 { 1203 struct intel_engine_cs * const engine = rq->engine; 1204 struct intel_context * const ce = rq->context; 1205 1206 intel_context_get(ce); 1207 1208 if (unlikely(intel_context_is_banned(ce))) 1209 reset_active(rq, engine); 1210 1211 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1212 execlists_check_context(ce, engine); 1213 1214 if (ce->tag) { 1215 /* Use a fixed tag for OA and friends */ 1216 ce->lrc_desc |= (u64)ce->tag << 32; 1217 } else { 1218 /* We don't need a strict matching tag, just different values */ 1219 ce->lrc_desc &= ~GENMASK_ULL(47, 37); 1220 ce->lrc_desc |= 1221 (u64)(++engine->context_tag % NUM_CONTEXT_TAG) << 1222 GEN11_SW_CTX_ID_SHIFT; 1223 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID); 1224 } 1225 1226 __intel_gt_pm_get(engine->gt); 1227 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1228 intel_engine_context_in(engine); 1229 1230 return engine; 1231 } 1232 1233 static inline struct i915_request * 1234 execlists_schedule_in(struct i915_request *rq, int idx) 1235 { 1236 struct intel_context * const ce = rq->context; 1237 struct intel_engine_cs *old; 1238 1239 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1240 trace_i915_request_in(rq, idx); 1241 1242 old = READ_ONCE(ce->inflight); 1243 do { 1244 if (!old) { 1245 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1246 break; 1247 } 1248 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1249 1250 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1251 return i915_request_get(rq); 1252 } 1253 1254 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1255 { 1256 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1257 struct i915_request *next = READ_ONCE(ve->request); 1258 1259 if (next && next->execution_mask & ~rq->execution_mask) 1260 tasklet_schedule(&ve->base.execlists.tasklet); 1261 } 1262 1263 static inline void 1264 __execlists_schedule_out(struct i915_request *rq, 1265 struct intel_engine_cs * const engine) 1266 { 1267 struct intel_context * const ce = rq->context; 1268 1269 /* 1270 * NB process_csb() is not under the engine->active.lock and hence 1271 * schedule_out can race with schedule_in meaning that we should 1272 * refrain from doing non-trivial work here. 1273 */ 1274 1275 /* 1276 * If we have just completed this context, the engine may now be 1277 * idle and we want to re-enter powersaving. 
1278 */ 1279 if (list_is_last(&rq->link, &ce->timeline->requests) && 1280 i915_request_completed(rq)) 1281 intel_engine_add_retire(engine, ce->timeline); 1282 1283 intel_engine_context_out(engine); 1284 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1285 intel_gt_pm_put_async(engine->gt); 1286 1287 /* 1288 * If this is part of a virtual engine, its next request may 1289 * have been blocked waiting for access to the active context. 1290 * We have to kick all the siblings again in case we need to 1291 * switch (e.g. the next request is not runnable on this 1292 * engine). Hopefully, we will already have submitted the next 1293 * request before the tasklet runs and do not need to rebuild 1294 * each virtual tree and kick everyone again. 1295 */ 1296 if (ce->engine != engine) 1297 kick_siblings(rq, ce); 1298 1299 intel_context_put(ce); 1300 } 1301 1302 static inline void 1303 execlists_schedule_out(struct i915_request *rq) 1304 { 1305 struct intel_context * const ce = rq->context; 1306 struct intel_engine_cs *cur, *old; 1307 1308 trace_i915_request_out(rq); 1309 1310 old = READ_ONCE(ce->inflight); 1311 do 1312 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1313 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1314 if (!cur) 1315 __execlists_schedule_out(rq, old); 1316 1317 i915_request_put(rq); 1318 } 1319 1320 static u64 execlists_update_context(struct i915_request *rq) 1321 { 1322 struct intel_context *ce = rq->context; 1323 u64 desc = ce->lrc_desc; 1324 u32 tail, prev; 1325 1326 /* 1327 * WaIdleLiteRestore:bdw,skl 1328 * 1329 * We should never submit the context with the same RING_TAIL twice 1330 * just in case we submit an empty ring, which confuses the HW. 1331 * 1332 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1333 * the normal request to be able to always advance the RING_TAIL on 1334 * subsequent resubmissions (for lite restore). Should that fail us, 1335 * and we try and submit the same tail again, force the context 1336 * reload. 1337 * 1338 * If we need to return to a preempted context, we need to skip the 1339 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1340 * HW has a tendency to ignore us rewinding the TAIL to the end of 1341 * an earlier request. 1342 */ 1343 tail = intel_ring_set_tail(rq->ring, rq->tail); 1344 prev = ce->lrc_reg_state[CTX_RING_TAIL]; 1345 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1346 desc |= CTX_DESC_FORCE_RESTORE; 1347 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1348 rq->tail = rq->wa_tail; 1349 1350 /* 1351 * Make sure the context image is complete before we submit it to HW. 1352 * 1353 * Ostensibly, writes (including the WCB) should be flushed prior to 1354 * an uncached write such as our mmio register access, the empirical 1355 * evidence (esp. on Braswell) suggests that the WC write into memory 1356 * may not be visible to the HW prior to the completion of the UC 1357 * register write and that we may begin execution from the context 1358 * before its image is complete leading to invalid PD chasing. 
1359 */ 1360 wmb(); 1361 1362 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; 1363 return desc; 1364 } 1365 1366 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1367 { 1368 if (execlists->ctrl_reg) { 1369 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1370 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1371 } else { 1372 writel(upper_32_bits(desc), execlists->submit_reg); 1373 writel(lower_32_bits(desc), execlists->submit_reg); 1374 } 1375 } 1376 1377 static __maybe_unused void 1378 trace_ports(const struct intel_engine_execlists *execlists, 1379 const char *msg, 1380 struct i915_request * const *ports) 1381 { 1382 const struct intel_engine_cs *engine = 1383 container_of(execlists, typeof(*engine), execlists); 1384 1385 if (!ports[0]) 1386 return; 1387 1388 ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg, 1389 ports[0]->fence.context, 1390 ports[0]->fence.seqno, 1391 i915_request_completed(ports[0]) ? "!" : 1392 i915_request_started(ports[0]) ? "*" : 1393 "", 1394 ports[1] ? ports[1]->fence.context : 0, 1395 ports[1] ? ports[1]->fence.seqno : 0); 1396 } 1397 1398 static __maybe_unused bool 1399 assert_pending_valid(const struct intel_engine_execlists *execlists, 1400 const char *msg) 1401 { 1402 struct i915_request * const *port, *rq; 1403 struct intel_context *ce = NULL; 1404 1405 trace_ports(execlists, msg, execlists->pending); 1406 1407 if (!execlists->pending[0]) { 1408 GEM_TRACE_ERR("Nothing pending for promotion!\n"); 1409 return false; 1410 } 1411 1412 if (execlists->pending[execlists_num_ports(execlists)]) { 1413 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n", 1414 execlists_num_ports(execlists)); 1415 return false; 1416 } 1417 1418 for (port = execlists->pending; (rq = *port); port++) { 1419 unsigned long flags; 1420 bool ok = true; 1421 1422 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1423 GEM_BUG_ON(!i915_request_is_active(rq)); 1424 1425 if (ce == rq->context) { 1426 GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n", 1427 ce->timeline->fence_context, 1428 port - execlists->pending); 1429 return false; 1430 } 1431 ce = rq->context; 1432 1433 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1434 if (!spin_trylock_irqsave(&rq->lock, flags)) 1435 continue; 1436 1437 if (i915_request_completed(rq)) 1438 goto unlock; 1439 1440 if (i915_active_is_idle(&ce->active) && 1441 !intel_context_is_barrier(ce)) { 1442 GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n", 1443 ce->timeline->fence_context, 1444 port - execlists->pending); 1445 ok = false; 1446 goto unlock; 1447 } 1448 1449 if (!i915_vma_is_pinned(ce->state)) { 1450 GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n", 1451 ce->timeline->fence_context, 1452 port - execlists->pending); 1453 ok = false; 1454 goto unlock; 1455 } 1456 1457 if (!i915_vma_is_pinned(ce->ring->vma)) { 1458 GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n", 1459 ce->timeline->fence_context, 1460 port - execlists->pending); 1461 ok = false; 1462 goto unlock; 1463 } 1464 1465 unlock: 1466 spin_unlock_irqrestore(&rq->lock, flags); 1467 if (!ok) 1468 return false; 1469 } 1470 1471 return ce; 1472 } 1473 1474 static void execlists_submit_ports(struct intel_engine_cs *engine) 1475 { 1476 struct intel_engine_execlists *execlists = &engine->execlists; 1477 unsigned int n; 1478 1479 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1480 1481 /* 1482 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1483 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1484 * not be relinquished until the device is idle (see 1485 * i915_gem_idle_work_handler()). As a precaution, we make sure 1486 * that all ELSP are drained i.e. we have processed the CSB, 1487 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1488 */ 1489 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1490 1491 /* 1492 * ELSQ note: the submit queue is not cleared after being submitted 1493 * to the HW so we need to make sure we always clean it up. This is 1494 * currently ensured by the fact that we always write the same number 1495 * of elsq entries, keep this in mind before changing the loop below. 1496 */ 1497 for (n = execlists_num_ports(execlists); n--; ) { 1498 struct i915_request *rq = execlists->pending[n]; 1499 1500 write_desc(execlists, 1501 rq ? execlists_update_context(rq) : 0, 1502 n); 1503 } 1504 1505 /* we need to manually load the submit queue */ 1506 if (execlists->ctrl_reg) 1507 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1508 } 1509 1510 static bool ctx_single_port_submission(const struct intel_context *ce) 1511 { 1512 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1513 intel_context_force_single_submission(ce)); 1514 } 1515 1516 static bool can_merge_ctx(const struct intel_context *prev, 1517 const struct intel_context *next) 1518 { 1519 if (prev != next) 1520 return false; 1521 1522 if (ctx_single_port_submission(prev)) 1523 return false; 1524 1525 return true; 1526 } 1527 1528 static bool can_merge_rq(const struct i915_request *prev, 1529 const struct i915_request *next) 1530 { 1531 GEM_BUG_ON(prev == next); 1532 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1533 1534 /* 1535 * We do not submit known completed requests. Therefore if the next 1536 * request is already completed, we can pretend to merge it in 1537 * with the previous context (and we will skip updating the ELSP 1538 * and tracking). Thus hopefully keeping the ELSP full with active 1539 * contexts, despite the best efforts of preempt-to-busy to confuse 1540 * us. 
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((prev->fence.flags ^ next->fence.flags) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
				     struct intel_engine_cs *engine)
{
	struct intel_engine_cs *old = ve->siblings[0];

	/* All unattached (rq->engine == old) must already be completed */

	spin_lock(&old->breadcrumbs.irq_lock);
	if (!list_empty(&ve->context.signal_link)) {
		list_move_tail(&ve->context.signal_link,
			       &engine->breadcrumbs.signalers);
		intel_engine_signal_breadcrumbs(engine);
	}
	spin_unlock(&old->breadcrumbs.irq_lock);
}

static struct i915_request *
last_active(const struct intel_engine_execlists *execlists)
{
	struct i915_request * const *last = READ_ONCE(execlists->active);

	while (*last && i915_request_completed(*last))
		last++;

	return *last;
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
1629 */ 1630 do { 1631 struct i915_dependency *p; 1632 1633 GEM_BUG_ON(i915_request_is_active(rq)); 1634 list_move_tail(&rq->sched.link, pl); 1635 1636 for_each_waiter(p, rq) { 1637 struct i915_request *w = 1638 container_of(p->waiter, typeof(*w), sched); 1639 1640 /* Leave semaphores spinning on the other engines */ 1641 if (w->engine != rq->engine) 1642 continue; 1643 1644 /* No waiter should start before its signaler */ 1645 GEM_BUG_ON(i915_request_started(w) && 1646 !i915_request_completed(rq)); 1647 1648 GEM_BUG_ON(i915_request_is_active(w)); 1649 if (!i915_request_is_ready(w)) 1650 continue; 1651 1652 if (rq_prio(w) < rq_prio(rq)) 1653 continue; 1654 1655 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1656 list_move_tail(&w->sched.link, &list); 1657 } 1658 1659 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1660 } while (rq); 1661 } 1662 1663 static void defer_active(struct intel_engine_cs *engine) 1664 { 1665 struct i915_request *rq; 1666 1667 rq = __unwind_incomplete_requests(engine); 1668 if (!rq) 1669 return; 1670 1671 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1672 } 1673 1674 static bool 1675 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 1676 { 1677 int hint; 1678 1679 if (!intel_engine_has_timeslices(engine)) 1680 return false; 1681 1682 hint = engine->execlists.queue_priority_hint; 1683 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1684 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1685 1686 return hint >= effective_prio(rq); 1687 } 1688 1689 static int 1690 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1691 { 1692 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1693 return INT_MIN; 1694 1695 return rq_prio(list_next_entry(rq, sched.link)); 1696 } 1697 1698 static inline unsigned long 1699 timeslice(const struct intel_engine_cs *engine) 1700 { 1701 return READ_ONCE(engine->props.timeslice_duration_ms); 1702 } 1703 1704 static unsigned long 1705 active_timeslice(const struct intel_engine_cs *engine) 1706 { 1707 const struct i915_request *rq = *engine->execlists.active; 1708 1709 if (!rq || i915_request_completed(rq)) 1710 return 0; 1711 1712 if (engine->execlists.switch_priority_hint < effective_prio(rq)) 1713 return 0; 1714 1715 return timeslice(engine); 1716 } 1717 1718 static void set_timeslice(struct intel_engine_cs *engine) 1719 { 1720 if (!intel_engine_has_timeslices(engine)) 1721 return; 1722 1723 set_timer_ms(&engine->execlists.timer, active_timeslice(engine)); 1724 } 1725 1726 static void start_timeslice(struct intel_engine_cs *engine) 1727 { 1728 struct intel_engine_execlists *execlists = &engine->execlists; 1729 1730 execlists->switch_priority_hint = execlists->queue_priority_hint; 1731 1732 if (timer_pending(&execlists->timer)) 1733 return; 1734 1735 set_timer_ms(&execlists->timer, timeslice(engine)); 1736 } 1737 1738 static void record_preemption(struct intel_engine_execlists *execlists) 1739 { 1740 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 1741 } 1742 1743 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine) 1744 { 1745 struct i915_request *rq; 1746 1747 rq = last_active(&engine->execlists); 1748 if (!rq) 1749 return 0; 1750 1751 /* Force a fast reset for terminated contexts (ignoring sysfs!) 
*/ 1752 if (unlikely(intel_context_is_banned(rq->context))) 1753 return 1; 1754 1755 return READ_ONCE(engine->props.preempt_timeout_ms); 1756 } 1757 1758 static void set_preempt_timeout(struct intel_engine_cs *engine) 1759 { 1760 if (!intel_engine_has_preempt_reset(engine)) 1761 return; 1762 1763 set_timer_ms(&engine->execlists.preempt, 1764 active_preempt_timeout(engine)); 1765 } 1766 1767 static inline void clear_ports(struct i915_request **ports, int count) 1768 { 1769 memset_p((void **)ports, NULL, count); 1770 } 1771 1772 static void execlists_dequeue(struct intel_engine_cs *engine) 1773 { 1774 struct intel_engine_execlists * const execlists = &engine->execlists; 1775 struct i915_request **port = execlists->pending; 1776 struct i915_request ** const last_port = port + execlists->port_mask; 1777 struct i915_request *last; 1778 struct rb_node *rb; 1779 bool submit = false; 1780 1781 /* 1782 * Hardware submission is through 2 ports. Conceptually each port 1783 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 1784 * static for a context, and unique to each, so we only execute 1785 * requests belonging to a single context from each ring. RING_HEAD 1786 * is maintained by the CS in the context image, it marks the place 1787 * where it got up to last time, and through RING_TAIL we tell the CS 1788 * where we want to execute up to this time. 1789 * 1790 * In this list the requests are in order of execution. Consecutive 1791 * requests from the same context are adjacent in the ringbuffer. We 1792 * can combine these requests into a single RING_TAIL update: 1793 * 1794 * RING_HEAD...req1...req2 1795 * ^- RING_TAIL 1796 * since to execute req2 the CS must first execute req1. 1797 * 1798 * Our goal then is to point each port to the end of a consecutive 1799 * sequence of requests as being the most optimal (fewest wake ups 1800 * and context switches) submission. 1801 */ 1802 1803 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 1804 struct virtual_engine *ve = 1805 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1806 struct i915_request *rq = READ_ONCE(ve->request); 1807 1808 if (!rq) { /* lazily cleanup after another engine handled rq */ 1809 rb_erase_cached(rb, &execlists->virtual); 1810 RB_CLEAR_NODE(rb); 1811 rb = rb_first_cached(&execlists->virtual); 1812 continue; 1813 } 1814 1815 if (!virtual_matches(ve, rq, engine)) { 1816 rb = rb_next(rb); 1817 continue; 1818 } 1819 1820 break; 1821 } 1822 1823 /* 1824 * If the queue is higher priority than the last 1825 * request in the currently active context, submit afresh. 1826 * We will resubmit again afterwards in case we need to split 1827 * the active context to interject the preemption request, 1828 * i.e. we will retrigger preemption following the ack in case 1829 * of trouble. 1830 */ 1831 last = last_active(execlists); 1832 if (last) { 1833 if (need_preempt(engine, last, rb)) { 1834 ENGINE_TRACE(engine, 1835 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 1836 last->fence.context, 1837 last->fence.seqno, 1838 last->sched.attr.priority, 1839 execlists->queue_priority_hint); 1840 record_preemption(execlists); 1841 1842 /* 1843 * Don't let the RING_HEAD advance past the breadcrumb 1844 * as we unwind (and until we resubmit) so that we do 1845 * not accidentally tell it to go backwards. 
1846 */ 1847 ring_set_paused(engine, 1); 1848 1849 /* 1850 * Note that we have not stopped the GPU at this point, 1851 * so we are unwinding the incomplete requests as they 1852 * remain inflight and so by the time we do complete 1853 * the preemption, some of the unwound requests may 1854 * complete! 1855 */ 1856 __unwind_incomplete_requests(engine); 1857 1858 last = NULL; 1859 } else if (need_timeslice(engine, last) && 1860 timer_expired(&engine->execlists.timer)) { 1861 ENGINE_TRACE(engine, 1862 "expired last=%llx:%lld, prio=%d, hint=%d\n", 1863 last->fence.context, 1864 last->fence.seqno, 1865 last->sched.attr.priority, 1866 execlists->queue_priority_hint); 1867 1868 ring_set_paused(engine, 1); 1869 defer_active(engine); 1870 1871 /* 1872 * Unlike for preemption, if we rewind and continue 1873 * executing the same context as previously active, 1874 * the order of execution will remain the same and 1875 * the tail will only advance. We do not need to 1876 * force a full context restore, as a lite-restore 1877 * is sufficient to resample the monotonic TAIL. 1878 * 1879 * If we switch to any other context, similarly we 1880 * will not rewind TAIL of current context, and 1881 * normal save/restore will preserve state and allow 1882 * us to later continue executing the same request. 1883 */ 1884 last = NULL; 1885 } else { 1886 /* 1887 * Otherwise if we already have a request pending 1888 * for execution after the current one, we can 1889 * just wait until the next CS event before 1890 * queuing more. In either case we will force a 1891 * lite-restore preemption event, but if we wait 1892 * we hopefully coalesce several updates into a single 1893 * submission. 1894 */ 1895 if (!list_is_last(&last->sched.link, 1896 &engine->active.requests)) { 1897 /* 1898 * Even if ELSP[1] is occupied and not worthy 1899 * of timeslices, our queue might be. 1900 */ 1901 start_timeslice(engine); 1902 return; 1903 } 1904 } 1905 } 1906 1907 while (rb) { /* XXX virtual is always taking precedence */ 1908 struct virtual_engine *ve = 1909 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1910 struct i915_request *rq; 1911 1912 spin_lock(&ve->base.active.lock); 1913 1914 rq = ve->request; 1915 if (unlikely(!rq)) { /* lost the race to a sibling */ 1916 spin_unlock(&ve->base.active.lock); 1917 rb_erase_cached(rb, &execlists->virtual); 1918 RB_CLEAR_NODE(rb); 1919 rb = rb_first_cached(&execlists->virtual); 1920 continue; 1921 } 1922 1923 GEM_BUG_ON(rq != ve->request); 1924 GEM_BUG_ON(rq->engine != &ve->base); 1925 GEM_BUG_ON(rq->context != &ve->context); 1926 1927 if (rq_prio(rq) >= queue_prio(execlists)) { 1928 if (!virtual_matches(ve, rq, engine)) { 1929 spin_unlock(&ve->base.active.lock); 1930 rb = rb_next(rb); 1931 continue; 1932 } 1933 1934 if (last && !can_merge_rq(last, rq)) { 1935 spin_unlock(&ve->base.active.lock); 1936 start_timeslice(engine); 1937 return; /* leave this for another sibling */ 1938 } 1939 1940 ENGINE_TRACE(engine, 1941 "virtual rq=%llx:%lld%s, new engine? %s\n", 1942 rq->fence.context, 1943 rq->fence.seqno, 1944 i915_request_completed(rq) ? "!" : 1945 i915_request_started(rq) ? 
"*" : 1946 "", 1947 yesno(engine != ve->siblings[0])); 1948 1949 ve->request = NULL; 1950 ve->base.execlists.queue_priority_hint = INT_MIN; 1951 rb_erase_cached(rb, &execlists->virtual); 1952 RB_CLEAR_NODE(rb); 1953 1954 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1955 rq->engine = engine; 1956 1957 if (engine != ve->siblings[0]) { 1958 u32 *regs = ve->context.lrc_reg_state; 1959 unsigned int n; 1960 1961 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1962 1963 if (!intel_engine_has_relative_mmio(engine)) 1964 virtual_update_register_offsets(regs, 1965 engine); 1966 1967 if (!list_empty(&ve->context.signals)) 1968 virtual_xfer_breadcrumbs(ve, engine); 1969 1970 /* 1971 * Move the bound engine to the top of the list 1972 * for future execution. We then kick this 1973 * tasklet first before checking others, so that 1974 * we preferentially reuse this set of bound 1975 * registers. 1976 */ 1977 for (n = 1; n < ve->num_siblings; n++) { 1978 if (ve->siblings[n] == engine) { 1979 swap(ve->siblings[n], 1980 ve->siblings[0]); 1981 break; 1982 } 1983 } 1984 1985 GEM_BUG_ON(ve->siblings[0] != engine); 1986 } 1987 1988 if (__i915_request_submit(rq)) { 1989 submit = true; 1990 last = rq; 1991 } 1992 i915_request_put(rq); 1993 1994 /* 1995 * Hmm, we have a bunch of virtual engine requests, 1996 * but the first one was already completed (thanks 1997 * preempt-to-busy!). Keep looking at the veng queue 1998 * until we have no more relevant requests (i.e. 1999 * the normal submit queue has higher priority). 2000 */ 2001 if (!submit) { 2002 spin_unlock(&ve->base.active.lock); 2003 rb = rb_first_cached(&execlists->virtual); 2004 continue; 2005 } 2006 } 2007 2008 spin_unlock(&ve->base.active.lock); 2009 break; 2010 } 2011 2012 while ((rb = rb_first_cached(&execlists->queue))) { 2013 struct i915_priolist *p = to_priolist(rb); 2014 struct i915_request *rq, *rn; 2015 int i; 2016 2017 priolist_for_each_request_consume(rq, rn, p, i) { 2018 bool merge = true; 2019 2020 /* 2021 * Can we combine this request with the current port? 2022 * It has to be the same context/ringbuffer and not 2023 * have any exceptions (e.g. GVT saying never to 2024 * combine contexts). 2025 * 2026 * If we can combine the requests, we can execute both 2027 * by updating the RING_TAIL to point to the end of the 2028 * second request, and so we never need to tell the 2029 * hardware about the first. 2030 */ 2031 if (last && !can_merge_rq(last, rq)) { 2032 /* 2033 * If we are on the second port and cannot 2034 * combine this request with the last, then we 2035 * are done. 2036 */ 2037 if (port == last_port) 2038 goto done; 2039 2040 /* 2041 * We must not populate both ELSP[] with the 2042 * same LRCA, i.e. we must submit 2 different 2043 * contexts if we submit 2 ELSP. 2044 */ 2045 if (last->context == rq->context) 2046 goto done; 2047 2048 if (i915_request_has_sentinel(last)) 2049 goto done; 2050 2051 /* 2052 * If GVT overrides us we only ever submit 2053 * port[0], leaving port[1] empty. Note that we 2054 * also have to be careful that we don't queue 2055 * the same context (even though a different 2056 * request) to the second port. 
2057 */ 2058 if (ctx_single_port_submission(last->context) || 2059 ctx_single_port_submission(rq->context)) 2060 goto done; 2061 2062 merge = false; 2063 } 2064 2065 if (__i915_request_submit(rq)) { 2066 if (!merge) { 2067 *port = execlists_schedule_in(last, port - execlists->pending); 2068 port++; 2069 last = NULL; 2070 } 2071 2072 GEM_BUG_ON(last && 2073 !can_merge_ctx(last->context, 2074 rq->context)); 2075 2076 submit = true; 2077 last = rq; 2078 } 2079 } 2080 2081 rb_erase_cached(&p->node, &execlists->queue); 2082 i915_priolist_free(p); 2083 } 2084 2085 done: 2086 /* 2087 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2088 * 2089 * We choose the priority hint such that if we add a request of greater 2090 * priority than this, we kick the submission tasklet to decide on 2091 * the right order of submitting the requests to hardware. We must 2092 * also be prepared to reorder requests as they are in-flight on the 2093 * HW. We derive the priority hint then as the first "hole" in 2094 * the HW submission ports and if there are no available slots, 2095 * the priority of the lowest executing request, i.e. last. 2096 * 2097 * When we do receive a higher priority request ready to run from the 2098 * user, see queue_request(), the priority hint is bumped to that 2099 * request triggering preemption on the next dequeue (or subsequent 2100 * interrupt for secondary ports). 2101 */ 2102 execlists->queue_priority_hint = queue_prio(execlists); 2103 2104 if (submit) { 2105 *port = execlists_schedule_in(last, port - execlists->pending); 2106 execlists->switch_priority_hint = 2107 switch_prio(engine, *execlists->pending); 2108 2109 /* 2110 * Skip if we ended up with exactly the same set of requests, 2111 * e.g. trying to timeslice a pair of ordered contexts 2112 */ 2113 if (!memcmp(execlists->active, execlists->pending, 2114 (port - execlists->pending + 1) * sizeof(*port))) { 2115 do 2116 execlists_schedule_out(fetch_and_zero(port)); 2117 while (port-- != execlists->pending); 2118 2119 goto skip_submit; 2120 } 2121 clear_ports(port + 1, last_port - port); 2122 2123 execlists_submit_ports(engine); 2124 set_preempt_timeout(engine); 2125 } else { 2126 skip_submit: 2127 ring_set_paused(engine, 0); 2128 } 2129 } 2130 2131 static void 2132 cancel_port_requests(struct intel_engine_execlists * const execlists) 2133 { 2134 struct i915_request * const *port; 2135 2136 for (port = execlists->pending; *port; port++) 2137 execlists_schedule_out(*port); 2138 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2139 2140 /* Mark the end of active before we overwrite *active */ 2141 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2142 execlists_schedule_out(*port); 2143 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2144 2145 WRITE_ONCE(execlists->active, execlists->inflight); 2146 } 2147 2148 static inline void 2149 invalidate_csb_entries(const u32 *first, const u32 *last) 2150 { 2151 clflush((void *)first); 2152 clflush((void *)last); 2153 } 2154 2155 static inline bool 2156 reset_in_progress(const struct intel_engine_execlists *execlists) 2157 { 2158 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 2159 } 2160 2161 /* 2162 * Starting with Gen12, the status has a new format: 2163 * 2164 * bit 0: switched to new queue 2165 * bit 1: reserved 2166 * bit 2: semaphore wait mode (poll or signal), only valid when 2167 * switch detail is set to "wait on semaphore" 2168 * bits 3-5: engine class 2169 * bits 6-11: engine instance 2170 * 
bits 12-14: reserved 2171 * bits 15-25: sw context id of the lrc the GT switched to 2172 * bits 26-31: sw counter of the lrc the GT switched to 2173 * bits 32-35: context switch detail 2174 * - 0: ctx complete 2175 * - 1: wait on sync flip 2176 * - 2: wait on vblank 2177 * - 3: wait on scanline 2178 * - 4: wait on semaphore 2179 * - 5: context preempted (not on SEMAPHORE_WAIT or 2180 * WAIT_FOR_EVENT) 2181 * bit 36: reserved 2182 * bits 37-43: wait detail (for switch detail 1 to 4) 2183 * bits 44-46: reserved 2184 * bits 47-57: sw context id of the lrc the GT switched away from 2185 * bits 58-63: sw counter of the lrc the GT switched away from 2186 */ 2187 static inline bool 2188 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2189 { 2190 u32 lower_dw = csb[0]; 2191 u32 upper_dw = csb[1]; 2192 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2193 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2194 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2195 2196 /* 2197 * The context switch detail is not guaranteed to be 5 when a preemption 2198 * occurs, so we can't just check for that. The check below works for 2199 * all the cases we care about, including preemptions of WAIT 2200 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2201 * would require some extra handling, but we don't support that. 2202 */ 2203 if (!ctx_away_valid || new_queue) { 2204 GEM_BUG_ON(!ctx_to_valid); 2205 return true; 2206 } 2207 2208 /* 2209 * switch detail = 5 is covered by the case above and we do not expect a 2210 * context switch on an unsuccessful wait instruction since we always 2211 * use polling mode. 2212 */ 2213 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2214 return false; 2215 } 2216 2217 static inline bool 2218 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2219 { 2220 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2221 } 2222 2223 static void process_csb(struct intel_engine_cs *engine) 2224 { 2225 struct intel_engine_execlists * const execlists = &engine->execlists; 2226 const u32 * const buf = execlists->csb_status; 2227 const u8 num_entries = execlists->csb_size; 2228 u8 head, tail; 2229 2230 /* 2231 * As we modify our execlists state tracking we require exclusive 2232 * access. Either we are inside the tasklet, or the tasklet is disabled 2233 * and we assume that is only inside the reset paths and so serialised. 2234 */ 2235 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2236 !reset_in_progress(execlists)); 2237 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2238 2239 /* 2240 * Note that csb_write, csb_status may be either in HWSP or mmio. 2241 * When reading from the csb_write mmio register, we have to be 2242 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2243 * the low 4bits. As it happens we know the next 4bits are always 2244 * zero and so we can simply masked off the low u8 of the register 2245 * and treat it identically to reading from the HWSP (without having 2246 * to use explicit shifting and masking, and probably bifurcating 2247 * the code to handle the legacy mmio read). 2248 */ 2249 head = execlists->csb_head; 2250 tail = READ_ONCE(*execlists->csb_write); 2251 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2252 if (unlikely(head == tail)) 2253 return; 2254 2255 /* 2256 * Hopefully paired with a wmb() in HW! 
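 * (Editor's note: i.e. we rely on the hardware making the CSB entries
 * globally visible before it advances the write pointer; the rmb() below
 * is the CPU-side half of that pairing.)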
2257 * 2258 * We must complete the read of the write pointer before any reads 2259 * from the CSB, so that we do not see stale values. Without an rmb 2260 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2261 * we perform the READ_ONCE(*csb_write). 2262 */ 2263 rmb(); 2264 2265 do { 2266 bool promote; 2267 2268 if (++head == num_entries) 2269 head = 0; 2270 2271 /* 2272 * We are flying near dragons again. 2273 * 2274 * We hold a reference to the request in execlist_port[] 2275 * but no more than that. We are operating in softirq 2276 * context and so cannot hold any mutex or sleep. That 2277 * prevents us stopping the requests we are processing 2278 * in port[] from being retired simultaneously (the 2279 * breadcrumb will be complete before we see the 2280 * context-switch). As we only hold the reference to the 2281 * request, any pointer chasing underneath the request 2282 * is subject to a potential use-after-free. Thus we 2283 * store all of the bookkeeping within port[] as 2284 * required, and avoid using unguarded pointers beneath 2285 * request itself. The same applies to the atomic 2286 * status notifier. 2287 */ 2288 2289 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2290 head, buf[2 * head + 0], buf[2 * head + 1]); 2291 2292 if (INTEL_GEN(engine->i915) >= 12) 2293 promote = gen12_csb_parse(execlists, buf + 2 * head); 2294 else 2295 promote = gen8_csb_parse(execlists, buf + 2 * head); 2296 if (promote) { 2297 struct i915_request * const *old = execlists->active; 2298 2299 /* Point active to the new ELSP; prevent overwriting */ 2300 WRITE_ONCE(execlists->active, execlists->pending); 2301 2302 if (!inject_preempt_hang(execlists)) 2303 ring_set_paused(engine, 0); 2304 2305 /* cancel old inflight, prepare for switch */ 2306 trace_ports(execlists, "preempted", old); 2307 while (*old) 2308 execlists_schedule_out(*old++); 2309 2310 /* switch pending to inflight */ 2311 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2312 WRITE_ONCE(execlists->active, 2313 memcpy(execlists->inflight, 2314 execlists->pending, 2315 execlists_num_ports(execlists) * 2316 sizeof(*execlists->pending))); 2317 2318 WRITE_ONCE(execlists->pending[0], NULL); 2319 } else { 2320 GEM_BUG_ON(!*execlists->active); 2321 2322 /* port0 completed, advanced to port1 */ 2323 trace_ports(execlists, "completed", execlists->active); 2324 2325 /* 2326 * We rely on the hardware being strongly 2327 * ordered, that the breadcrumb write is 2328 * coherent (visible from the CPU) before the 2329 * user interrupt and CSB is processed. 2330 */ 2331 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 2332 !reset_in_progress(execlists)); 2333 execlists_schedule_out(*execlists->active++); 2334 2335 GEM_BUG_ON(execlists->active - execlists->inflight > 2336 execlists_num_ports(execlists)); 2337 } 2338 } while (head != tail); 2339 2340 execlists->csb_head = head; 2341 set_timeslice(engine); 2342 2343 /* 2344 * Gen11 has proven to fail wrt global observation point between 2345 * entry and tail update, failing on the ordering and thus 2346 * we see an old entry in the context status buffer. 2347 * 2348 * Forcibly evict out entries for the next gpu csb update, 2349 * to increase the odds that we get a fresh entries with non 2350 * working hardware. The cost for doing so comes out mostly with 2351 * the wash as hardware, working or not, will need to do the 2352 * invalidation before. 
2353 */ 2354 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2355 } 2356 2357 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2358 { 2359 lockdep_assert_held(&engine->active.lock); 2360 if (!engine->execlists.pending[0]) { 2361 rcu_read_lock(); /* protect peeking at execlists->active */ 2362 execlists_dequeue(engine); 2363 rcu_read_unlock(); 2364 } 2365 } 2366 2367 static void __execlists_hold(struct i915_request *rq) 2368 { 2369 LIST_HEAD(list); 2370 2371 do { 2372 struct i915_dependency *p; 2373 2374 if (i915_request_is_active(rq)) 2375 __i915_request_unsubmit(rq); 2376 2377 RQ_TRACE(rq, "on hold\n"); 2378 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2379 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2380 i915_request_set_hold(rq); 2381 2382 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 2383 struct i915_request *w = 2384 container_of(p->waiter, typeof(*w), sched); 2385 2386 /* Leave semaphores spinning on the other engines */ 2387 if (w->engine != rq->engine) 2388 continue; 2389 2390 if (!i915_request_is_ready(w)) 2391 continue; 2392 2393 if (i915_request_completed(w)) 2394 continue; 2395 2396 if (i915_request_on_hold(rq)) 2397 continue; 2398 2399 list_move_tail(&w->sched.link, &list); 2400 } 2401 2402 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2403 } while (rq); 2404 } 2405 2406 static bool execlists_hold(struct intel_engine_cs *engine, 2407 struct i915_request *rq) 2408 { 2409 spin_lock_irq(&engine->active.lock); 2410 2411 if (i915_request_completed(rq)) { /* too late! */ 2412 rq = NULL; 2413 goto unlock; 2414 } 2415 2416 if (rq->engine != engine) { /* preempted virtual engine */ 2417 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2418 2419 /* 2420 * intel_context_inflight() is only protected by virtue 2421 * of process_csb() being called only by the tasklet (or 2422 * directly from inside reset while the tasklet is suspended). 2423 * Assert that neither of those are allowed to run while we 2424 * poke at the request queues. 2425 */ 2426 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2427 2428 /* 2429 * An unsubmitted request along a virtual engine will 2430 * remain on the active (this) engine until we are able 2431 * to process the context switch away (and so mark the 2432 * context as no longer in flight). That cannot have happened 2433 * yet, otherwise we would not be hanging! 2434 */ 2435 spin_lock(&ve->base.active.lock); 2436 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2437 GEM_BUG_ON(ve->request != rq); 2438 ve->request = NULL; 2439 spin_unlock(&ve->base.active.lock); 2440 i915_request_put(rq); 2441 2442 rq->engine = engine; 2443 } 2444 2445 /* 2446 * Transfer this request onto the hold queue to prevent it 2447 * being resumbitted to HW (and potentially completed) before we have 2448 * released it. Since we may have already submitted following 2449 * requests, we need to remove those as well. 2450 */ 2451 GEM_BUG_ON(i915_request_on_hold(rq)); 2452 GEM_BUG_ON(rq->engine != engine); 2453 __execlists_hold(rq); 2454 2455 unlock: 2456 spin_unlock_irq(&engine->active.lock); 2457 return rq; 2458 } 2459 2460 static bool hold_request(const struct i915_request *rq) 2461 { 2462 struct i915_dependency *p; 2463 2464 /* 2465 * If one of our ancestors is on hold, we must also be on hold, 2466 * otherwise we will bypass it and execute before it. 
2467 */ 2468 list_for_each_entry(p, &rq->sched.signalers_list, signal_link) { 2469 const struct i915_request *s = 2470 container_of(p->signaler, typeof(*s), sched); 2471 2472 if (s->engine != rq->engine) 2473 continue; 2474 2475 if (i915_request_on_hold(s)) 2476 return true; 2477 } 2478 2479 return false; 2480 } 2481 2482 static void __execlists_unhold(struct i915_request *rq) 2483 { 2484 LIST_HEAD(list); 2485 2486 do { 2487 struct i915_dependency *p; 2488 2489 GEM_BUG_ON(!i915_request_on_hold(rq)); 2490 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2491 2492 i915_request_clear_hold(rq); 2493 list_move_tail(&rq->sched.link, 2494 i915_sched_lookup_priolist(rq->engine, 2495 rq_prio(rq))); 2496 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2497 RQ_TRACE(rq, "hold release\n"); 2498 2499 /* Also release any children on this engine that are ready */ 2500 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 2501 struct i915_request *w = 2502 container_of(p->waiter, typeof(*w), sched); 2503 2504 if (w->engine != rq->engine) 2505 continue; 2506 2507 if (!i915_request_on_hold(rq)) 2508 continue; 2509 2510 /* Check that no other parents are also on hold */ 2511 if (hold_request(rq)) 2512 continue; 2513 2514 list_move_tail(&w->sched.link, &list); 2515 } 2516 2517 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2518 } while (rq); 2519 } 2520 2521 static void execlists_unhold(struct intel_engine_cs *engine, 2522 struct i915_request *rq) 2523 { 2524 spin_lock_irq(&engine->active.lock); 2525 2526 /* 2527 * Move this request back to the priority queue, and all of its 2528 * children and grandchildren that were suspended along with it. 2529 */ 2530 __execlists_unhold(rq); 2531 2532 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2533 engine->execlists.queue_priority_hint = rq_prio(rq); 2534 tasklet_hi_schedule(&engine->execlists.tasklet); 2535 } 2536 2537 spin_unlock_irq(&engine->active.lock); 2538 } 2539 2540 struct execlists_capture { 2541 struct work_struct work; 2542 struct i915_request *rq; 2543 struct i915_gpu_coredump *error; 2544 }; 2545 2546 static void execlists_capture_work(struct work_struct *work) 2547 { 2548 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2549 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2550 struct intel_engine_cs *engine = cap->rq->engine; 2551 struct intel_gt_coredump *gt = cap->error->gt; 2552 struct intel_engine_capture_vma *vma; 2553 2554 /* Compress all the objects attached to the request, slow! 
*/ 2555 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2556 if (vma) { 2557 struct i915_vma_compress *compress = 2558 i915_vma_capture_prepare(gt); 2559 2560 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2561 i915_vma_capture_finish(gt, compress); 2562 } 2563 2564 gt->simulated = gt->engine->simulated; 2565 cap->error->simulated = gt->simulated; 2566 2567 /* Publish the error state, and announce it to the world */ 2568 i915_error_state_store(cap->error); 2569 i915_gpu_coredump_put(cap->error); 2570 2571 /* Return this request and all that depend upon it for signaling */ 2572 execlists_unhold(engine, cap->rq); 2573 i915_request_put(cap->rq); 2574 2575 kfree(cap); 2576 } 2577 2578 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2579 { 2580 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2581 struct execlists_capture *cap; 2582 2583 cap = kmalloc(sizeof(*cap), gfp); 2584 if (!cap) 2585 return NULL; 2586 2587 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2588 if (!cap->error) 2589 goto err_cap; 2590 2591 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2592 if (!cap->error->gt) 2593 goto err_gpu; 2594 2595 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2596 if (!cap->error->gt->engine) 2597 goto err_gt; 2598 2599 return cap; 2600 2601 err_gt: 2602 kfree(cap->error->gt); 2603 err_gpu: 2604 kfree(cap->error); 2605 err_cap: 2606 kfree(cap); 2607 return NULL; 2608 } 2609 2610 static bool execlists_capture(struct intel_engine_cs *engine) 2611 { 2612 struct execlists_capture *cap; 2613 2614 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 2615 return true; 2616 2617 /* 2618 * We need to _quickly_ capture the engine state before we reset. 2619 * We are inside an atomic section (softirq) here and we are delaying 2620 * the forced preemption event. 2621 */ 2622 cap = capture_regs(engine); 2623 if (!cap) 2624 return true; 2625 2626 cap->rq = execlists_active(&engine->execlists); 2627 GEM_BUG_ON(!cap->rq); 2628 2629 rcu_read_lock(); 2630 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 2631 cap->rq = i915_request_get_rcu(cap->rq); 2632 rcu_read_unlock(); 2633 if (!cap->rq) 2634 goto err_free; 2635 2636 /* 2637 * Remove the request from the execlists queue, and take ownership 2638 * of the request. We pass it to our worker who will _slowly_ compress 2639 * all the pages the _user_ requested for debugging their batch, after 2640 * which we return it to the queue for signaling. 2641 * 2642 * By removing them from the execlists queue, we also remove the 2643 * requests from being processed by __unwind_incomplete_requests() 2644 * during the intel_engine_reset(), and so they will *not* be replayed 2645 * afterwards. 2646 * 2647 * Note that because we have not yet reset the engine at this point, 2648 * it is possible for the request that we have identified as being 2649 * guilty, did in fact complete and we will then hit an arbitration 2650 * point allowing the outstanding preemption to succeed. The likelihood 2651 * of that is very low (as capturing of the engine registers should be 2652 * fast enough to run inside an irq-off atomic section!), so we will 2653 * simply hold that request accountable for being non-preemptible 2654 * long enough to force the reset. 
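 *
 * Editor's note -- a condensed sketch of the whole capture path as it is
 * wired up around this function, for orientation (all of the callees are
 * visible in this file):
 *
 *	preempt_reset()              softirq; engine frozen first via
 *	                             ring_set_paused(engine, 1)
 *	  execlists_capture()
 *	    capture_regs()           GFP_ATOMIC allocation of the coredump
 *	                             skeleton
 *	    execlists_hold(rq)       park rq and its waiters off the queues
 *	    schedule_work()
 *	  intel_engine_reset()       if execlists_capture() returned true
 *	...later, in process context...
 *	execlists_capture_work()     slow page compression with GFP_KERNEL,
 *	                             i915_error_state_store(), then
 *	                             execlists_unhold() + i915_request_put()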
2655 */ 2656 if (!execlists_hold(engine, cap->rq)) 2657 goto err_rq; 2658 2659 INIT_WORK(&cap->work, execlists_capture_work); 2660 schedule_work(&cap->work); 2661 return true; 2662 2663 err_rq: 2664 i915_request_put(cap->rq); 2665 err_free: 2666 i915_gpu_coredump_put(cap->error); 2667 kfree(cap); 2668 return false; 2669 } 2670 2671 static noinline void preempt_reset(struct intel_engine_cs *engine) 2672 { 2673 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2674 unsigned long *lock = &engine->gt->reset.flags; 2675 2676 if (i915_modparams.reset < 3) 2677 return; 2678 2679 if (test_and_set_bit(bit, lock)) 2680 return; 2681 2682 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2683 tasklet_disable_nosync(&engine->execlists.tasklet); 2684 2685 ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n", 2686 READ_ONCE(engine->props.preempt_timeout_ms), 2687 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); 2688 2689 ring_set_paused(engine, 1); /* Freeze the current request in place */ 2690 if (execlists_capture(engine)) 2691 intel_engine_reset(engine, "preemption time out"); 2692 else 2693 ring_set_paused(engine, 0); 2694 2695 tasklet_enable(&engine->execlists.tasklet); 2696 clear_and_wake_up_bit(bit, lock); 2697 } 2698 2699 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2700 { 2701 const struct timer_list *t = &engine->execlists.preempt; 2702 2703 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2704 return false; 2705 2706 if (!timer_expired(t)) 2707 return false; 2708 2709 return READ_ONCE(engine->execlists.pending[0]); 2710 } 2711 2712 /* 2713 * Check the unread Context Status Buffers and manage the submission of new 2714 * contexts to the ELSP accordingly. 2715 */ 2716 static void execlists_submission_tasklet(unsigned long data) 2717 { 2718 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2719 bool timeout = preempt_timeout(engine); 2720 2721 process_csb(engine); 2722 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2723 unsigned long flags; 2724 2725 spin_lock_irqsave(&engine->active.lock, flags); 2726 __execlists_submission_tasklet(engine); 2727 spin_unlock_irqrestore(&engine->active.lock, flags); 2728 2729 /* Recheck after serialising with direct-submission */ 2730 if (timeout && preempt_timeout(engine)) 2731 preempt_reset(engine); 2732 } 2733 } 2734 2735 static void __execlists_kick(struct intel_engine_execlists *execlists) 2736 { 2737 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2738 tasklet_hi_schedule(&execlists->tasklet); 2739 } 2740 2741 #define execlists_kick(t, member) \ 2742 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2743 2744 static void execlists_timeslice(struct timer_list *timer) 2745 { 2746 execlists_kick(timer, timer); 2747 } 2748 2749 static void execlists_preempt(struct timer_list *timer) 2750 { 2751 execlists_kick(timer, preempt); 2752 } 2753 2754 static void queue_request(struct intel_engine_cs *engine, 2755 struct i915_request *rq) 2756 { 2757 GEM_BUG_ON(!list_empty(&rq->sched.link)); 2758 list_add_tail(&rq->sched.link, 2759 i915_sched_lookup_priolist(engine, rq_prio(rq))); 2760 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2761 } 2762 2763 static void __submit_queue_imm(struct intel_engine_cs *engine) 2764 { 2765 struct intel_engine_execlists * const execlists = &engine->execlists; 2766 2767 if (reset_in_progress(execlists)) 2768 return; /* defer until we restart the engine following reset */ 2769 2770 if 
(execlists->tasklet.func == execlists_submission_tasklet) 2771 __execlists_submission_tasklet(engine); 2772 else 2773 tasklet_hi_schedule(&execlists->tasklet); 2774 } 2775 2776 static void submit_queue(struct intel_engine_cs *engine, 2777 const struct i915_request *rq) 2778 { 2779 struct intel_engine_execlists *execlists = &engine->execlists; 2780 2781 if (rq_prio(rq) <= execlists->queue_priority_hint) 2782 return; 2783 2784 execlists->queue_priority_hint = rq_prio(rq); 2785 __submit_queue_imm(engine); 2786 } 2787 2788 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 2789 const struct i915_request *rq) 2790 { 2791 GEM_BUG_ON(i915_request_on_hold(rq)); 2792 return !list_empty(&engine->active.hold) && hold_request(rq); 2793 } 2794 2795 static void execlists_submit_request(struct i915_request *request) 2796 { 2797 struct intel_engine_cs *engine = request->engine; 2798 unsigned long flags; 2799 2800 /* Will be called from irq-context when using foreign fences. */ 2801 spin_lock_irqsave(&engine->active.lock, flags); 2802 2803 if (unlikely(ancestor_on_hold(engine, request))) { 2804 list_add_tail(&request->sched.link, &engine->active.hold); 2805 i915_request_set_hold(request); 2806 } else { 2807 queue_request(engine, request); 2808 2809 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2810 GEM_BUG_ON(list_empty(&request->sched.link)); 2811 2812 submit_queue(engine, request); 2813 } 2814 2815 spin_unlock_irqrestore(&engine->active.lock, flags); 2816 } 2817 2818 static void __execlists_context_fini(struct intel_context *ce) 2819 { 2820 intel_ring_put(ce->ring); 2821 i915_vma_put(ce->state); 2822 } 2823 2824 static void execlists_context_destroy(struct kref *kref) 2825 { 2826 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2827 2828 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2829 GEM_BUG_ON(intel_context_is_pinned(ce)); 2830 2831 if (ce->state) 2832 __execlists_context_fini(ce); 2833 2834 intel_context_fini(ce); 2835 intel_context_free(ce); 2836 } 2837 2838 static void 2839 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2840 { 2841 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2842 return; 2843 2844 vaddr += engine->context_size; 2845 2846 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 2847 } 2848 2849 static void 2850 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2851 { 2852 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2853 return; 2854 2855 vaddr += engine->context_size; 2856 2857 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 2858 dev_err_once(engine->i915->drm.dev, 2859 "%s context redzone overwritten!\n", 2860 engine->name); 2861 } 2862 2863 static void execlists_context_unpin(struct intel_context *ce) 2864 { 2865 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2866 ce->engine); 2867 2868 i915_gem_object_unpin_map(ce->state->obj); 2869 } 2870 2871 static void 2872 __execlists_update_reg_state(const struct intel_context *ce, 2873 const struct intel_engine_cs *engine, 2874 u32 head) 2875 { 2876 struct intel_ring *ring = ce->ring; 2877 u32 *regs = ce->lrc_reg_state; 2878 2879 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 2880 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2881 2882 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 2883 regs[CTX_RING_HEAD] = head; 2884 regs[CTX_RING_TAIL] = ring->tail; 2885 2886 /* RPCS */ 2887 if (engine->class == RENDER_CLASS) { 2888 regs[CTX_R_PWR_CLK_STATE] = 2889 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2890 2891 
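/*
 * Editor's note: i915_oa_init_reg_state() below refreshes the OA (perf)
 * related registers kept in this context image as well, so a context that
 * is being (re)pinned or reset picks up the current OA configuration.
 */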
i915_oa_init_reg_state(ce, engine); 2892 } 2893 } 2894 2895 static int 2896 __execlists_context_pin(struct intel_context *ce, 2897 struct intel_engine_cs *engine) 2898 { 2899 void *vaddr; 2900 2901 GEM_BUG_ON(!ce->state); 2902 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2903 2904 vaddr = i915_gem_object_pin_map(ce->state->obj, 2905 i915_coherent_map_type(engine->i915) | 2906 I915_MAP_OVERRIDE); 2907 if (IS_ERR(vaddr)) 2908 return PTR_ERR(vaddr); 2909 2910 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 2911 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2912 __execlists_update_reg_state(ce, engine, ce->ring->tail); 2913 2914 return 0; 2915 } 2916 2917 static int execlists_context_pin(struct intel_context *ce) 2918 { 2919 return __execlists_context_pin(ce, ce->engine); 2920 } 2921 2922 static int execlists_context_alloc(struct intel_context *ce) 2923 { 2924 return __execlists_context_alloc(ce, ce->engine); 2925 } 2926 2927 static void execlists_context_reset(struct intel_context *ce) 2928 { 2929 CE_TRACE(ce, "reset\n"); 2930 GEM_BUG_ON(!intel_context_is_pinned(ce)); 2931 2932 /* 2933 * Because we emit WA_TAIL_DWORDS there may be a disparity 2934 * between our bookkeeping in ce->ring->head and ce->ring->tail and 2935 * that stored in context. As we only write new commands from 2936 * ce->ring->tail onwards, everything before that is junk. If the GPU 2937 * starts reading from its RING_HEAD from the context, it may try to 2938 * execute that junk and die. 2939 * 2940 * The contexts that are stilled pinned on resume belong to the 2941 * kernel, and are local to each engine. All other contexts will 2942 * have their head/tail sanitized upon pinning before use, so they 2943 * will never see garbage, 2944 * 2945 * So to avoid that we reset the context images upon resume. For 2946 * simplicity, we just zero everything out. 2947 */ 2948 intel_ring_reset(ce->ring, ce->ring->emit); 2949 2950 /* Scrub away the garbage */ 2951 execlists_init_reg_state(ce->lrc_reg_state, 2952 ce, ce->engine, ce->ring, true); 2953 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 2954 2955 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 2956 } 2957 2958 static const struct intel_context_ops execlists_context_ops = { 2959 .alloc = execlists_context_alloc, 2960 2961 .pin = execlists_context_pin, 2962 .unpin = execlists_context_unpin, 2963 2964 .enter = intel_context_enter_engine, 2965 .exit = intel_context_exit_engine, 2966 2967 .reset = execlists_context_reset, 2968 .destroy = execlists_context_destroy, 2969 }; 2970 2971 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 2972 { 2973 u32 *cs; 2974 2975 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb); 2976 2977 cs = intel_ring_begin(rq, 6); 2978 if (IS_ERR(cs)) 2979 return PTR_ERR(cs); 2980 2981 /* 2982 * Check if we have been preempted before we even get started. 2983 * 2984 * After this point i915_request_started() reports true, even if 2985 * we get preempted and so are no longer running. 
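 *
 * Editor's note -- a hedged sketch of why the store below writes
 * rq->fence.seqno - 1 into the timeline HWSP: "started" is tracked by
 * seqno, so the check reduces to roughly
 *
 *	i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1)
 *
 * and the moment the CS executes this store (just after the MI_ARB_CHECK
 * preemption point) that predicate becomes true, whether or not the
 * request ever reaches its payload.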
2986 */ 2987 *cs++ = MI_ARB_CHECK; 2988 *cs++ = MI_NOOP; 2989 2990 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 2991 *cs++ = i915_request_timeline(rq)->hwsp_offset; 2992 *cs++ = 0; 2993 *cs++ = rq->fence.seqno - 1; 2994 2995 intel_ring_advance(rq, cs); 2996 2997 /* Record the updated position of the request's payload */ 2998 rq->infix = intel_ring_offset(rq, cs); 2999 3000 return 0; 3001 } 3002 3003 static int execlists_request_alloc(struct i915_request *request) 3004 { 3005 int ret; 3006 3007 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3008 3009 /* 3010 * Flush enough space to reduce the likelihood of waiting after 3011 * we start building the request - in which case we will just 3012 * have to repeat work. 3013 */ 3014 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3015 3016 /* 3017 * Note that after this point, we have committed to using 3018 * this request as it is being used to both track the 3019 * state of engine initialisation and liveness of the 3020 * golden renderstate above. Think twice before you try 3021 * to cancel/unwind this request now. 3022 */ 3023 3024 /* Unconditionally invalidate GPU caches and TLBs. */ 3025 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3026 if (ret) 3027 return ret; 3028 3029 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3030 return 0; 3031 } 3032 3033 /* 3034 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 3035 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3036 * but there is a slight complication as this is applied in WA batch where the 3037 * values are only initialized once so we cannot take register value at the 3038 * beginning and reuse it further; hence we save its value to memory, upload a 3039 * constant value with bit21 set and then we restore it back with the saved value. 3040 * To simplify the WA, a constant value is formed by using the default value 3041 * of this register. This shouldn't be a problem because we are only modifying 3042 * it for a short period and this batch in non-premptible. We can ofcourse 3043 * use additional instructions that read the actual value of the register 3044 * at that time and set our bit of interest but it makes the WA complicated. 3045 * 3046 * This WA is also required for Gen9 so extracting as a function avoids 3047 * code duplication. 3048 */ 3049 static u32 * 3050 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3051 { 3052 /* NB no one else is allowed to scribble over scratch + 256! 
*/ 3053 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3054 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3055 *batch++ = intel_gt_scratch_offset(engine->gt, 3056 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3057 *batch++ = 0; 3058 3059 *batch++ = MI_LOAD_REGISTER_IMM(1); 3060 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3061 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3062 3063 batch = gen8_emit_pipe_control(batch, 3064 PIPE_CONTROL_CS_STALL | 3065 PIPE_CONTROL_DC_FLUSH_ENABLE, 3066 0); 3067 3068 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3069 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3070 *batch++ = intel_gt_scratch_offset(engine->gt, 3071 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3072 *batch++ = 0; 3073 3074 return batch; 3075 } 3076 3077 /* 3078 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3079 * initialized at the beginning and shared across all contexts but this field 3080 * helps us to have multiple batches at different offsets and select them based 3081 * on a criteria. At the moment this batch always start at the beginning of the page 3082 * and at this point we don't have multiple wa_ctx batch buffers. 3083 * 3084 * The number of WA applied are not known at the beginning; we use this field 3085 * to return the no of DWORDS written. 3086 * 3087 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3088 * so it adds NOOPs as padding to make it cacheline aligned. 3089 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3090 * makes a complete batch buffer. 3091 */ 3092 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3093 { 3094 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3095 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3096 3097 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3098 if (IS_BROADWELL(engine->i915)) 3099 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3100 3101 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3102 /* Actual scratch location is at 128 bytes offset */ 3103 batch = gen8_emit_pipe_control(batch, 3104 PIPE_CONTROL_FLUSH_L3 | 3105 PIPE_CONTROL_STORE_DATA_INDEX | 3106 PIPE_CONTROL_CS_STALL | 3107 PIPE_CONTROL_QW_WRITE, 3108 LRC_PPHWSP_SCRATCH_ADDR); 3109 3110 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3111 3112 /* Pad to end of cacheline */ 3113 while ((unsigned long)batch % CACHELINE_BYTES) 3114 *batch++ = MI_NOOP; 3115 3116 /* 3117 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3118 * execution depends on the length specified in terms of cache lines 3119 * in the register CTX_RCS_INDIRECT_CTX 3120 */ 3121 3122 return batch; 3123 } 3124 3125 struct lri { 3126 i915_reg_t reg; 3127 u32 value; 3128 }; 3129 3130 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3131 { 3132 GEM_BUG_ON(!count || count > 63); 3133 3134 *batch++ = MI_LOAD_REGISTER_IMM(count); 3135 do { 3136 *batch++ = i915_mmio_reg_offset(lri->reg); 3137 *batch++ = lri->value; 3138 } while (lri++, --count); 3139 *batch++ = MI_NOOP; 3140 3141 return batch; 3142 } 3143 3144 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3145 { 3146 static const struct lri lri[] = { 3147 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3148 { 3149 COMMON_SLICE_CHICKEN2, 3150 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3151 0), 3152 }, 3153 3154 /* BSpec: 11391 */ 3155 { 3156 FF_SLICE_CHICKEN, 3157 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3158 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3159 }, 3160 3161 /* BSpec: 11299 */ 3162 { 3163 _3D_CHICKEN3, 3164 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3165 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3166 } 3167 }; 3168 3169 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3170 3171 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3172 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3173 3174 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3175 batch = gen8_emit_pipe_control(batch, 3176 PIPE_CONTROL_FLUSH_L3 | 3177 PIPE_CONTROL_STORE_DATA_INDEX | 3178 PIPE_CONTROL_CS_STALL | 3179 PIPE_CONTROL_QW_WRITE, 3180 LRC_PPHWSP_SCRATCH_ADDR); 3181 3182 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3183 3184 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3185 if (HAS_POOLED_EU(engine->i915)) { 3186 /* 3187 * EU pool configuration is setup along with golden context 3188 * during context initialization. This value depends on 3189 * device type (2x6 or 3x6) and needs to be updated based 3190 * on which subslice is disabled especially for 2x6 3191 * devices, however it is safe to load default 3192 * configuration of 3x6 device instead of masking off 3193 * corresponding bits because HW ignores bits of a disabled 3194 * subslice and drops down to appropriate config. Please 3195 * see render_state_setup() in i915_gem_render_state.c for 3196 * possible configurations, to avoid duplication they are 3197 * not shown here again. 3198 */ 3199 *batch++ = GEN9_MEDIA_POOL_STATE; 3200 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3201 *batch++ = 0x00777000; 3202 *batch++ = 0; 3203 *batch++ = 0; 3204 *batch++ = 0; 3205 } 3206 3207 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3208 3209 /* Pad to end of cacheline */ 3210 while ((unsigned long)batch % CACHELINE_BYTES) 3211 *batch++ = MI_NOOP; 3212 3213 return batch; 3214 } 3215 3216 static u32 * 3217 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3218 { 3219 int i; 3220 3221 /* 3222 * WaPipeControlBefore3DStateSamplePattern: cnl 3223 * 3224 * Ensure the engine is idle prior to programming a 3225 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3226 */ 3227 batch = gen8_emit_pipe_control(batch, 3228 PIPE_CONTROL_CS_STALL, 3229 0); 3230 /* 3231 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3232 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3233 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3234 * confusing. Since gen8_emit_pipe_control() already advances the 3235 * batch by 6 dwords, we advance the other 10 here, completing a 3236 * cacheline. It's not clear if the workaround requires this padding 3237 * before other commands, or if it's just the regular padding we would 3238 * already have for the workaround bb, so leave it here for now. 
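 *
 * Editor's note -- the padding arithmetic spelled out:
 *
 *	gen8_emit_pipe_control()   6 dwords
 *	MI_NOOP loop below        10 dwords
 *	total                     16 dwords * 4 bytes = 64 bytes
 *
 * i.e. exactly one cacheline, which is why ten NOOPs are emitted here
 * rather than the twelve dwords of zeroes the workaround text quotes for
 * its assumed 4-dword PIPE_CONTROL.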
3239 */ 3240 for (i = 0; i < 10; i++) 3241 *batch++ = MI_NOOP; 3242 3243 /* Pad to end of cacheline */ 3244 while ((unsigned long)batch % CACHELINE_BYTES) 3245 *batch++ = MI_NOOP; 3246 3247 return batch; 3248 } 3249 3250 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3251 3252 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3253 { 3254 struct drm_i915_gem_object *obj; 3255 struct i915_vma *vma; 3256 int err; 3257 3258 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3259 if (IS_ERR(obj)) 3260 return PTR_ERR(obj); 3261 3262 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3263 if (IS_ERR(vma)) { 3264 err = PTR_ERR(vma); 3265 goto err; 3266 } 3267 3268 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 3269 if (err) 3270 goto err; 3271 3272 engine->wa_ctx.vma = vma; 3273 return 0; 3274 3275 err: 3276 i915_gem_object_put(obj); 3277 return err; 3278 } 3279 3280 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3281 { 3282 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3283 } 3284 3285 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3286 3287 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3288 { 3289 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3290 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3291 &wa_ctx->per_ctx }; 3292 wa_bb_func_t wa_bb_fn[2]; 3293 struct page *page; 3294 void *batch, *batch_ptr; 3295 unsigned int i; 3296 int ret; 3297 3298 if (engine->class != RENDER_CLASS) 3299 return 0; 3300 3301 switch (INTEL_GEN(engine->i915)) { 3302 case 12: 3303 case 11: 3304 return 0; 3305 case 10: 3306 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3307 wa_bb_fn[1] = NULL; 3308 break; 3309 case 9: 3310 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3311 wa_bb_fn[1] = NULL; 3312 break; 3313 case 8: 3314 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3315 wa_bb_fn[1] = NULL; 3316 break; 3317 default: 3318 MISSING_CASE(INTEL_GEN(engine->i915)); 3319 return 0; 3320 } 3321 3322 ret = lrc_setup_wa_ctx(engine); 3323 if (ret) { 3324 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 3325 return ret; 3326 } 3327 3328 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 3329 batch = batch_ptr = kmap_atomic(page); 3330 3331 /* 3332 * Emit the two workaround batch buffers, recording the offset from the 3333 * start of the workaround batch buffer object for each and their 3334 * respective sizes. 
3335 */ 3336 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3337 wa_bb[i]->offset = batch_ptr - batch; 3338 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3339 CACHELINE_BYTES))) { 3340 ret = -EINVAL; 3341 break; 3342 } 3343 if (wa_bb_fn[i]) 3344 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3345 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3346 } 3347 3348 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3349 3350 kunmap_atomic(batch); 3351 if (ret) 3352 lrc_destroy_wa_ctx(engine); 3353 3354 return ret; 3355 } 3356 3357 static void enable_execlists(struct intel_engine_cs *engine) 3358 { 3359 u32 mode; 3360 3361 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 3362 3363 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 3364 3365 if (INTEL_GEN(engine->i915) >= 11) 3366 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 3367 else 3368 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 3369 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 3370 3371 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 3372 3373 ENGINE_WRITE_FW(engine, 3374 RING_HWS_PGA, 3375 i915_ggtt_offset(engine->status_page.vma)); 3376 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 3377 3378 engine->context_tag = 0; 3379 } 3380 3381 static bool unexpected_starting_state(struct intel_engine_cs *engine) 3382 { 3383 bool unexpected = false; 3384 3385 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 3386 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 3387 unexpected = true; 3388 } 3389 3390 return unexpected; 3391 } 3392 3393 static int execlists_resume(struct intel_engine_cs *engine) 3394 { 3395 intel_engine_apply_workarounds(engine); 3396 intel_engine_apply_whitelist(engine); 3397 3398 intel_mocs_init_engine(engine); 3399 3400 intel_engine_reset_breadcrumbs(engine); 3401 3402 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 3403 struct drm_printer p = drm_debug_printer(__func__); 3404 3405 intel_engine_dump(engine, &p, NULL); 3406 } 3407 3408 enable_execlists(engine); 3409 3410 return 0; 3411 } 3412 3413 static void execlists_reset_prepare(struct intel_engine_cs *engine) 3414 { 3415 struct intel_engine_execlists * const execlists = &engine->execlists; 3416 unsigned long flags; 3417 3418 ENGINE_TRACE(engine, "depth<-%d\n", 3419 atomic_read(&execlists->tasklet.count)); 3420 3421 /* 3422 * Prevent request submission to the hardware until we have 3423 * completed the reset in i915_gem_reset_finish(). If a request 3424 * is completed by one engine, it may then queue a request 3425 * to a second via its execlists->tasklet *just* as we are 3426 * calling engine->resume() and also writing the ELSP. 3427 * Turning off the execlists->tasklet until the reset is over 3428 * prevents the race. 3429 */ 3430 __tasklet_disable_sync_once(&execlists->tasklet); 3431 GEM_BUG_ON(!reset_in_progress(execlists)); 3432 3433 /* And flush any current direct submission. */ 3434 spin_lock_irqsave(&engine->active.lock, flags); 3435 spin_unlock_irqrestore(&engine->active.lock, flags); 3436 3437 /* 3438 * We stop engines, otherwise we might get failed reset and a 3439 * dead gpu (on elk). Also as modern gpu as kbl can suffer 3440 * from system hang if batchbuffer is progressing when 3441 * the reset is issued, regardless of READY_TO_RESET ack. 3442 * Thus assume it is best to stop engines on all gens 3443 * where we have a gpu reset. 
3444 * 3445 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 3446 * 3447 * FIXME: Wa for more modern gens needs to be validated 3448 */ 3449 intel_engine_stop_cs(engine); 3450 } 3451 3452 static void reset_csb_pointers(struct intel_engine_cs *engine) 3453 { 3454 struct intel_engine_execlists * const execlists = &engine->execlists; 3455 const unsigned int reset_value = execlists->csb_size - 1; 3456 3457 ring_set_paused(engine, 0); 3458 3459 /* 3460 * After a reset, the HW starts writing into CSB entry [0]. We 3461 * therefore have to set our HEAD pointer back one entry so that 3462 * the *first* entry we check is entry 0. To complicate this further, 3463 * as we don't wait for the first interrupt after reset, we have to 3464 * fake the HW write to point back to the last entry so that our 3465 * inline comparison of our cached head position against the last HW 3466 * write works even before the first interrupt. 3467 */ 3468 execlists->csb_head = reset_value; 3469 WRITE_ONCE(*execlists->csb_write, reset_value); 3470 wmb(); /* Make sure this is visible to HW (paranoia?) */ 3471 3472 /* 3473 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3474 * Bludgeon them with a mmio update to be sure. 3475 */ 3476 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3477 reset_value << 8 | reset_value); 3478 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3479 3480 invalidate_csb_entries(&execlists->csb_status[0], 3481 &execlists->csb_status[reset_value]); 3482 } 3483 3484 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 3485 { 3486 int x; 3487 3488 x = lrc_ring_mi_mode(engine); 3489 if (x != -1) { 3490 regs[x + 1] &= ~STOP_RING; 3491 regs[x + 1] |= STOP_RING << 16; 3492 } 3493 } 3494 3495 static void __execlists_reset_reg_state(const struct intel_context *ce, 3496 const struct intel_engine_cs *engine) 3497 { 3498 u32 *regs = ce->lrc_reg_state; 3499 3500 __reset_stop_ring(regs, engine); 3501 } 3502 3503 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 3504 { 3505 struct intel_engine_execlists * const execlists = &engine->execlists; 3506 struct intel_context *ce; 3507 struct i915_request *rq; 3508 u32 head; 3509 3510 mb(); /* paranoia: read the CSB pointers from after the reset */ 3511 clflush(execlists->csb_write); 3512 mb(); 3513 3514 process_csb(engine); /* drain preemption events */ 3515 3516 /* Following the reset, we need to reload the CSB read/write pointers */ 3517 reset_csb_pointers(engine); 3518 3519 /* 3520 * Save the currently executing context, even if we completed 3521 * its request, it was still running at the time of the 3522 * reset and will have been clobbered. 3523 */ 3524 rq = execlists_active(execlists); 3525 if (!rq) 3526 goto unwind; 3527 3528 /* We still have requests in-flight; the engine should be active */ 3529 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3530 3531 ce = rq->context; 3532 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3533 3534 if (i915_request_completed(rq)) { 3535 /* Idle context; tidy up the ring so we can restart afresh */ 3536 head = intel_ring_wrap(ce->ring, rq->tail); 3537 goto out_replay; 3538 } 3539 3540 /* Context has requests still in-flight; it should not be idle! */ 3541 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3542 rq = active_request(ce->timeline, rq); 3543 head = intel_ring_wrap(ce->ring, rq->head); 3544 GEM_BUG_ON(head == ce->ring->tail); 3545 3546 /* 3547 * If this request hasn't started yet, e.g. 
it is waiting on a 3548 * semaphore, we need to avoid skipping the request or else we 3549 * break the signaling chain. However, if the context is corrupt 3550 * the request will not restart and we will be stuck with a wedged 3551 * device. It is quite often the case that if we issue a reset 3552 * while the GPU is loading the context image, that the context 3553 * image becomes corrupt. 3554 * 3555 * Otherwise, if we have not started yet, the request should replay 3556 * perfectly and we do not need to flag the result as being erroneous. 3557 */ 3558 if (!i915_request_started(rq)) 3559 goto out_replay; 3560 3561 /* 3562 * If the request was innocent, we leave the request in the ELSP 3563 * and will try to replay it on restarting. The context image may 3564 * have been corrupted by the reset, in which case we may have 3565 * to service a new GPU hang, but more likely we can continue on 3566 * without impact. 3567 * 3568 * If the request was guilty, we presume the context is corrupt 3569 * and have to at least restore the RING register in the context 3570 * image back to the expected values to skip over the guilty request. 3571 */ 3572 __i915_request_reset(rq, stalled); 3573 if (!stalled) 3574 goto out_replay; 3575 3576 /* 3577 * We want a simple context + ring to execute the breadcrumb update. 3578 * We cannot rely on the context being intact across the GPU hang, 3579 * so clear it and rebuild just what we need for the breadcrumb. 3580 * All pending requests for this context will be zapped, and any 3581 * future request will be after userspace has had the opportunity 3582 * to recreate its own state. 3583 */ 3584 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3585 restore_default_state(ce, engine); 3586 3587 out_replay: 3588 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 3589 head, ce->ring->tail); 3590 __execlists_reset_reg_state(ce, engine); 3591 __execlists_update_reg_state(ce, engine, head); 3592 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3593 3594 unwind: 3595 /* Push back any incomplete requests for replay after the reset. */ 3596 cancel_port_requests(execlists); 3597 __unwind_incomplete_requests(engine); 3598 } 3599 3600 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 3601 { 3602 unsigned long flags; 3603 3604 ENGINE_TRACE(engine, "\n"); 3605 3606 spin_lock_irqsave(&engine->active.lock, flags); 3607 3608 __execlists_reset(engine, stalled); 3609 3610 spin_unlock_irqrestore(&engine->active.lock, flags); 3611 } 3612 3613 static void nop_submission_tasklet(unsigned long data) 3614 { 3615 /* The driver is wedged; don't process any more events. */ 3616 } 3617 3618 static void execlists_reset_cancel(struct intel_engine_cs *engine) 3619 { 3620 struct intel_engine_execlists * const execlists = &engine->execlists; 3621 struct i915_request *rq, *rn; 3622 struct rb_node *rb; 3623 unsigned long flags; 3624 3625 ENGINE_TRACE(engine, "\n"); 3626 3627 /* 3628 * Before we call engine->cancel_requests(), we should have exclusive 3629 * access to the submission state. This is arranged for us by the 3630 * caller disabling the interrupt generation, the tasklet and other 3631 * threads that may then access the same state, giving us a free hand 3632 * to reset state. 
However, we still need to let lockdep be aware that 3633 * we know this state may be accessed in hardirq context, so we 3634 * disable the irq around this manipulation and we want to keep 3635 * the spinlock focused on its duties and not accidentally conflate 3636 * coverage to the submission's irq state. (Similarly, although we 3637 * shouldn't need to disable irq around the manipulation of the 3638 * submission's irq state, we also wish to remind ourselves that 3639 * it is irq state.) 3640 */ 3641 spin_lock_irqsave(&engine->active.lock, flags); 3642 3643 __execlists_reset(engine, true); 3644 3645 /* Mark all executing requests as skipped. */ 3646 list_for_each_entry(rq, &engine->active.requests, sched.link) 3647 mark_eio(rq); 3648 3649 /* Flush the queued requests to the timeline list (for retiring). */ 3650 while ((rb = rb_first_cached(&execlists->queue))) { 3651 struct i915_priolist *p = to_priolist(rb); 3652 int i; 3653 3654 priolist_for_each_request_consume(rq, rn, p, i) { 3655 mark_eio(rq); 3656 __i915_request_submit(rq); 3657 } 3658 3659 rb_erase_cached(&p->node, &execlists->queue); 3660 i915_priolist_free(p); 3661 } 3662 3663 /* On-hold requests will be flushed to timeline upon their release */ 3664 list_for_each_entry(rq, &engine->active.hold, sched.link) 3665 mark_eio(rq); 3666 3667 /* Cancel all attached virtual engines */ 3668 while ((rb = rb_first_cached(&execlists->virtual))) { 3669 struct virtual_engine *ve = 3670 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3671 3672 rb_erase_cached(rb, &execlists->virtual); 3673 RB_CLEAR_NODE(rb); 3674 3675 spin_lock(&ve->base.active.lock); 3676 rq = fetch_and_zero(&ve->request); 3677 if (rq) { 3678 mark_eio(rq); 3679 3680 rq->engine = engine; 3681 __i915_request_submit(rq); 3682 i915_request_put(rq); 3683 3684 ve->base.execlists.queue_priority_hint = INT_MIN; 3685 } 3686 spin_unlock(&ve->base.active.lock); 3687 } 3688 3689 /* Remaining _unready_ requests will be nop'ed when submitted */ 3690 3691 execlists->queue_priority_hint = INT_MIN; 3692 execlists->queue = RB_ROOT_CACHED; 3693 3694 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3695 execlists->tasklet.func = nop_submission_tasklet; 3696 3697 spin_unlock_irqrestore(&engine->active.lock, flags); 3698 } 3699 3700 static void execlists_reset_finish(struct intel_engine_cs *engine) 3701 { 3702 struct intel_engine_execlists * const execlists = &engine->execlists; 3703 3704 /* 3705 * After a GPU reset, we may have requests to replay. Do so now while 3706 * we still have the forcewake to be sure that the GPU is not allowed 3707 * to sleep before we restart and reload a context. 3708 */ 3709 GEM_BUG_ON(!reset_in_progress(execlists)); 3710 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3711 execlists->tasklet.func(execlists->tasklet.data); 3712 3713 if (__tasklet_enable(&execlists->tasklet)) 3714 /* And kick in case we missed a new request submission. 
*/ 3715 tasklet_hi_schedule(&execlists->tasklet); 3716 ENGINE_TRACE(engine, "depth->%d\n", 3717 atomic_read(&execlists->tasklet.count)); 3718 } 3719 3720 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 3721 u64 offset, u32 len, 3722 const unsigned int flags) 3723 { 3724 u32 *cs; 3725 3726 cs = intel_ring_begin(rq, 4); 3727 if (IS_ERR(cs)) 3728 return PTR_ERR(cs); 3729 3730 /* 3731 * WaDisableCtxRestoreArbitration:bdw,chv 3732 * 3733 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3734 * particular all the gens that do not need the w/a at all!): if we 3735 * took care to make sure that on every switch into this context 3736 * (both ordinary and for preemption) arbitration was enabled, 3737 * we would be fine. However, for gen8 there is another w/a that 3738 * requires us to not preempt inside GPGPU execution, so we keep 3739 * arbitration disabled for gen8 batches. Arbitration will be 3740 * re-enabled before we close the request 3741 * (engine->emit_fini_breadcrumb). 3742 */ 3743 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3744 3745 /* FIXME(BDW+): Address space and security selectors. */ 3746 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3747 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3748 *cs++ = lower_32_bits(offset); 3749 *cs++ = upper_32_bits(offset); 3750 3751 intel_ring_advance(rq, cs); 3752 3753 return 0; 3754 } 3755 3756 static int gen8_emit_bb_start(struct i915_request *rq, 3757 u64 offset, u32 len, 3758 const unsigned int flags) 3759 { 3760 u32 *cs; 3761 3762 cs = intel_ring_begin(rq, 6); 3763 if (IS_ERR(cs)) 3764 return PTR_ERR(cs); 3765 3766 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3767 3768 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3769 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3770 *cs++ = lower_32_bits(offset); 3771 *cs++ = upper_32_bits(offset); 3772 3773 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3774 *cs++ = MI_NOOP; 3775 3776 intel_ring_advance(rq, cs); 3777 3778 return 0; 3779 } 3780 3781 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3782 { 3783 ENGINE_WRITE(engine, RING_IMR, 3784 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3785 ENGINE_POSTING_READ(engine, RING_IMR); 3786 } 3787 3788 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3789 { 3790 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3791 } 3792 3793 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3794 { 3795 u32 cmd, *cs; 3796 3797 cs = intel_ring_begin(request, 4); 3798 if (IS_ERR(cs)) 3799 return PTR_ERR(cs); 3800 3801 cmd = MI_FLUSH_DW + 1; 3802 3803 /* We always require a command barrier so that subsequent 3804 * commands, such as breadcrumb interrupts, are strictly ordered 3805 * wrt the contents of the write cache being flushed to memory 3806 * (and thus being coherent from the CPU).
3807 */ 3808 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3809 3810 if (mode & EMIT_INVALIDATE) { 3811 cmd |= MI_INVALIDATE_TLB; 3812 if (request->engine->class == VIDEO_DECODE_CLASS) 3813 cmd |= MI_INVALIDATE_BSD; 3814 } 3815 3816 *cs++ = cmd; 3817 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3818 *cs++ = 0; /* upper addr */ 3819 *cs++ = 0; /* value */ 3820 intel_ring_advance(request, cs); 3821 3822 return 0; 3823 } 3824 3825 static int gen8_emit_flush_render(struct i915_request *request, 3826 u32 mode) 3827 { 3828 bool vf_flush_wa = false, dc_flush_wa = false; 3829 u32 *cs, flags = 0; 3830 int len; 3831 3832 flags |= PIPE_CONTROL_CS_STALL; 3833 3834 if (mode & EMIT_FLUSH) { 3835 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3836 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3837 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3838 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3839 } 3840 3841 if (mode & EMIT_INVALIDATE) { 3842 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3843 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3844 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3845 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3846 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3847 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3848 flags |= PIPE_CONTROL_QW_WRITE; 3849 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3850 3851 /* 3852 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3853 * pipe control. 3854 */ 3855 if (IS_GEN(request->i915, 9)) 3856 vf_flush_wa = true; 3857 3858 /* WaForGAMHang:kbl */ 3859 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3860 dc_flush_wa = true; 3861 } 3862 3863 len = 6; 3864 3865 if (vf_flush_wa) 3866 len += 6; 3867 3868 if (dc_flush_wa) 3869 len += 12; 3870 3871 cs = intel_ring_begin(request, len); 3872 if (IS_ERR(cs)) 3873 return PTR_ERR(cs); 3874 3875 if (vf_flush_wa) 3876 cs = gen8_emit_pipe_control(cs, 0, 0); 3877 3878 if (dc_flush_wa) 3879 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3880 0); 3881 3882 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3883 3884 if (dc_flush_wa) 3885 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3886 3887 intel_ring_advance(request, cs); 3888 3889 return 0; 3890 } 3891 3892 static int gen11_emit_flush_render(struct i915_request *request, 3893 u32 mode) 3894 { 3895 if (mode & EMIT_FLUSH) { 3896 u32 *cs; 3897 u32 flags = 0; 3898 3899 flags |= PIPE_CONTROL_CS_STALL; 3900 3901 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3902 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3903 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3904 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3905 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3906 flags |= PIPE_CONTROL_QW_WRITE; 3907 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3908 3909 cs = intel_ring_begin(request, 6); 3910 if (IS_ERR(cs)) 3911 return PTR_ERR(cs); 3912 3913 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3914 intel_ring_advance(request, cs); 3915 } 3916 3917 if (mode & EMIT_INVALIDATE) { 3918 u32 *cs; 3919 u32 flags = 0; 3920 3921 flags |= PIPE_CONTROL_CS_STALL; 3922 3923 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3924 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3925 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3926 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3927 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3928 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3929 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3930 flags |= PIPE_CONTROL_QW_WRITE; 3931 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3932 3933 cs = intel_ring_begin(request, 6); 3934 if (IS_ERR(cs)) 
3935 return PTR_ERR(cs); 3936 3937 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3938 intel_ring_advance(request, cs); 3939 } 3940 3941 return 0; 3942 } 3943 3944 static u32 preparser_disable(bool state) 3945 { 3946 return MI_ARB_CHECK | 1 << 8 | state; 3947 } 3948 3949 static int gen12_emit_flush_render(struct i915_request *request, 3950 u32 mode) 3951 { 3952 if (mode & EMIT_FLUSH) { 3953 u32 flags = 0; 3954 u32 *cs; 3955 3956 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3957 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3958 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3959 /* Wa_1409600907:tgl */ 3960 flags |= PIPE_CONTROL_DEPTH_STALL; 3961 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3962 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3963 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3964 3965 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3966 flags |= PIPE_CONTROL_QW_WRITE; 3967 3968 flags |= PIPE_CONTROL_CS_STALL; 3969 3970 cs = intel_ring_begin(request, 6); 3971 if (IS_ERR(cs)) 3972 return PTR_ERR(cs); 3973 3974 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3975 intel_ring_advance(request, cs); 3976 } 3977 3978 if (mode & EMIT_INVALIDATE) { 3979 u32 flags = 0; 3980 u32 *cs; 3981 3982 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3983 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3984 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3985 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3986 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3987 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3988 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3989 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 3990 3991 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3992 flags |= PIPE_CONTROL_QW_WRITE; 3993 3994 flags |= PIPE_CONTROL_CS_STALL; 3995 3996 cs = intel_ring_begin(request, 8); 3997 if (IS_ERR(cs)) 3998 return PTR_ERR(cs); 3999 4000 /* 4001 * Prevent the pre-parser from skipping past the TLB 4002 * invalidate and loading a stale page for the batch 4003 * buffer / request payload. 4004 */ 4005 *cs++ = preparser_disable(true); 4006 4007 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4008 4009 *cs++ = preparser_disable(false); 4010 intel_ring_advance(request, cs); 4011 4012 /* 4013 * Wa_1604544889:tgl 4014 */ 4015 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 4016 flags = 0; 4017 flags |= PIPE_CONTROL_CS_STALL; 4018 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 4019 4020 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4021 flags |= PIPE_CONTROL_QW_WRITE; 4022 4023 cs = intel_ring_begin(request, 6); 4024 if (IS_ERR(cs)) 4025 return PTR_ERR(cs); 4026 4027 cs = gen8_emit_pipe_control(cs, flags, 4028 LRC_PPHWSP_SCRATCH_ADDR); 4029 intel_ring_advance(request, cs); 4030 } 4031 } 4032 4033 return 0; 4034 } 4035 4036 /* 4037 * Reserve space for 2 NOOPs at the end of each request to be 4038 * used as a workaround for not being allowed to do lite 4039 * restore with HEAD==TAIL (WaIdleLiteRestore). 4040 */ 4041 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4042 { 4043 /* Ensure there's always at least one preemption point per-request. 
*/ 4044 *cs++ = MI_ARB_CHECK; 4045 *cs++ = MI_NOOP; 4046 request->wa_tail = intel_ring_offset(request, cs); 4047 4048 return cs; 4049 } 4050 4051 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4052 { 4053 *cs++ = MI_SEMAPHORE_WAIT | 4054 MI_SEMAPHORE_GLOBAL_GTT | 4055 MI_SEMAPHORE_POLL | 4056 MI_SEMAPHORE_SAD_EQ_SDD; 4057 *cs++ = 0; 4058 *cs++ = intel_hws_preempt_address(request->engine); 4059 *cs++ = 0; 4060 4061 return cs; 4062 } 4063 4064 static __always_inline u32* 4065 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 4066 u32 *cs) 4067 { 4068 *cs++ = MI_USER_INTERRUPT; 4069 4070 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4071 if (intel_engine_has_semaphores(request->engine)) 4072 cs = emit_preempt_busywait(request, cs); 4073 4074 request->tail = intel_ring_offset(request, cs); 4075 assert_ring_tail_valid(request->ring, request->tail); 4076 4077 return gen8_emit_wa_tail(request, cs); 4078 } 4079 4080 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4081 { 4082 cs = gen8_emit_ggtt_write(cs, 4083 request->fence.seqno, 4084 i915_request_active_timeline(request)->hwsp_offset, 4085 0); 4086 4087 return gen8_emit_fini_breadcrumb_footer(request, cs); 4088 } 4089 4090 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4091 { 4092 cs = gen8_emit_pipe_control(cs, 4093 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4094 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4095 PIPE_CONTROL_DC_FLUSH_ENABLE, 4096 0); 4097 4098 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4099 cs = gen8_emit_ggtt_write_rcs(cs, 4100 request->fence.seqno, 4101 i915_request_active_timeline(request)->hwsp_offset, 4102 PIPE_CONTROL_FLUSH_ENABLE | 4103 PIPE_CONTROL_CS_STALL); 4104 4105 return gen8_emit_fini_breadcrumb_footer(request, cs); 4106 } 4107 4108 static u32 * 4109 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4110 { 4111 cs = gen8_emit_ggtt_write_rcs(cs, 4112 request->fence.seqno, 4113 i915_request_active_timeline(request)->hwsp_offset, 4114 PIPE_CONTROL_CS_STALL | 4115 PIPE_CONTROL_TILE_CACHE_FLUSH | 4116 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4117 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4118 PIPE_CONTROL_DC_FLUSH_ENABLE | 4119 PIPE_CONTROL_FLUSH_ENABLE); 4120 4121 return gen8_emit_fini_breadcrumb_footer(request, cs); 4122 } 4123 4124 /* 4125 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4126 * flush and will continue pre-fetching the instructions after it before the 4127 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4128 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4129 * of the next request before the memory has been flushed, we're guaranteed that 4130 * we won't access the batch itself too early. 4131 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4132 * so, if the current request is modifying an instruction in the next request on 4133 * the same intel_context, we might pre-fetch and then execute the pre-update 4134 * instruction. To avoid this, the users of self-modifying code should either 4135 * disable the parser around the code emitting the memory writes, via a new flag 4136 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4137 * the in-kernel use-cases we've opted to use a separate context, see 4138 * reloc_gpu() as an example. 4139 * All the above applies only to the instructions themselves. 
Non-inline data 4140 * used by the instructions is not pre-fetched. 4141 */ 4142 4143 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4144 { 4145 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4146 MI_SEMAPHORE_GLOBAL_GTT | 4147 MI_SEMAPHORE_POLL | 4148 MI_SEMAPHORE_SAD_EQ_SDD; 4149 *cs++ = 0; 4150 *cs++ = intel_hws_preempt_address(request->engine); 4151 *cs++ = 0; 4152 *cs++ = 0; 4153 *cs++ = MI_NOOP; 4154 4155 return cs; 4156 } 4157 4158 static __always_inline u32* 4159 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 4160 { 4161 *cs++ = MI_USER_INTERRUPT; 4162 4163 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4164 if (intel_engine_has_semaphores(request->engine)) 4165 cs = gen12_emit_preempt_busywait(request, cs); 4166 4167 request->tail = intel_ring_offset(request, cs); 4168 assert_ring_tail_valid(request->ring, request->tail); 4169 4170 return gen8_emit_wa_tail(request, cs); 4171 } 4172 4173 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4174 { 4175 cs = gen8_emit_ggtt_write(cs, 4176 request->fence.seqno, 4177 i915_request_active_timeline(request)->hwsp_offset, 4178 0); 4179 4180 return gen12_emit_fini_breadcrumb_footer(request, cs); 4181 } 4182 4183 static u32 * 4184 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4185 { 4186 cs = gen8_emit_ggtt_write_rcs(cs, 4187 request->fence.seqno, 4188 i915_request_active_timeline(request)->hwsp_offset, 4189 PIPE_CONTROL_CS_STALL | 4190 PIPE_CONTROL_TILE_CACHE_FLUSH | 4191 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4192 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4193 /* Wa_1409600907:tgl */ 4194 PIPE_CONTROL_DEPTH_STALL | 4195 PIPE_CONTROL_DC_FLUSH_ENABLE | 4196 PIPE_CONTROL_FLUSH_ENABLE | 4197 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 4198 4199 return gen12_emit_fini_breadcrumb_footer(request, cs); 4200 } 4201 4202 static void execlists_park(struct intel_engine_cs *engine) 4203 { 4204 cancel_timer(&engine->execlists.timer); 4205 cancel_timer(&engine->execlists.preempt); 4206 } 4207 4208 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4209 { 4210 engine->submit_request = execlists_submit_request; 4211 engine->schedule = i915_schedule; 4212 engine->execlists.tasklet.func = execlists_submission_tasklet; 4213 4214 engine->reset.prepare = execlists_reset_prepare; 4215 engine->reset.rewind = execlists_reset_rewind; 4216 engine->reset.cancel = execlists_reset_cancel; 4217 engine->reset.finish = execlists_reset_finish; 4218 4219 engine->park = execlists_park; 4220 engine->unpark = NULL; 4221 4222 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 4223 if (!intel_vgpu_active(engine->i915)) { 4224 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 4225 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 4226 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 4227 } 4228 4229 if (INTEL_GEN(engine->i915) >= 12) 4230 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 4231 4232 if (intel_engine_has_preemption(engine)) 4233 engine->emit_bb_start = gen8_emit_bb_start; 4234 else 4235 engine->emit_bb_start = gen8_emit_bb_start_noarb; 4236 } 4237 4238 static void execlists_shutdown(struct intel_engine_cs *engine) 4239 { 4240 /* Synchronise with residual timers and any softirq they raise */ 4241 del_timer_sync(&engine->execlists.timer); 4242 del_timer_sync(&engine->execlists.preempt); 4243 tasklet_kill(&engine->execlists.tasklet); 4244 } 4245 4246 static void execlists_release(struct intel_engine_cs *engine) 4247 { 4248 execlists_shutdown(engine); 4249 4250 intel_engine_cleanup_common(engine); 
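	/* Finally, release the per-engine workaround batch buffer (wa_ctx). */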
4251 lrc_destroy_wa_ctx(engine); 4252 } 4253 4254 static void 4255 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 4256 { 4257 /* Default vfuncs which can be overridden by each engine. */ 4258 4259 engine->resume = execlists_resume; 4260 4261 engine->cops = &execlists_context_ops; 4262 engine->request_alloc = execlists_request_alloc; 4263 4264 engine->emit_flush = gen8_emit_flush; 4265 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 4266 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 4267 if (INTEL_GEN(engine->i915) >= 12) 4268 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 4269 4270 engine->set_default_submission = intel_execlists_set_default_submission; 4271 4272 if (INTEL_GEN(engine->i915) < 11) { 4273 engine->irq_enable = gen8_logical_ring_enable_irq; 4274 engine->irq_disable = gen8_logical_ring_disable_irq; 4275 } else { 4276 /* 4277 * TODO: On Gen11 interrupt masks need to be clear 4278 * to allow C6 entry. Keep interrupts enabled 4279 * and take the hit of generating extra interrupts 4280 * until a more refined solution exists. 4281 */ 4282 } 4283 } 4284 4285 static inline void 4286 logical_ring_default_irqs(struct intel_engine_cs *engine) 4287 { 4288 unsigned int shift = 0; 4289 4290 if (INTEL_GEN(engine->i915) < 11) { 4291 const u8 irq_shifts[] = { 4292 [RCS0] = GEN8_RCS_IRQ_SHIFT, 4293 [BCS0] = GEN8_BCS_IRQ_SHIFT, 4294 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 4295 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 4296 [VECS0] = GEN8_VECS_IRQ_SHIFT, 4297 }; 4298 4299 shift = irq_shifts[engine->id]; 4300 } 4301 4302 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 4303 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 4304 } 4305 4306 static void rcs_submission_override(struct intel_engine_cs *engine) 4307 { 4308 switch (INTEL_GEN(engine->i915)) { 4309 case 12: 4310 engine->emit_flush = gen12_emit_flush_render; 4311 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 4312 break; 4313 case 11: 4314 engine->emit_flush = gen11_emit_flush_render; 4315 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 4316 break; 4317 default: 4318 engine->emit_flush = gen8_emit_flush_render; 4319 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 4320 break; 4321 } 4322 } 4323 4324 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 4325 { 4326 struct intel_engine_execlists * const execlists = &engine->execlists; 4327 struct drm_i915_private *i915 = engine->i915; 4328 struct intel_uncore *uncore = engine->uncore; 4329 u32 base = engine->mmio_base; 4330 4331 tasklet_init(&engine->execlists.tasklet, 4332 execlists_submission_tasklet, (unsigned long)engine); 4333 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 4334 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 4335 4336 logical_ring_default_vfuncs(engine); 4337 logical_ring_default_irqs(engine); 4338 4339 if (engine->class == RENDER_CLASS) 4340 rcs_submission_override(engine); 4341 4342 if (intel_init_workaround_bb(engine)) 4343 /* 4344 * We continue even if we fail to initialize the WA batch 4345 * because we only expect rare glitches, nothing 4346 * critical that would prevent us from using the GPU. 4347 */ 4348 DRM_ERROR("WA batch buffer initialization failed\n"); 4349 4350 if (HAS_LOGICAL_RING_ELSQ(i915)) { 4351 execlists->submit_reg = uncore->regs + 4352 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 4353 execlists->ctrl_reg = uncore->regs + 4354 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 4355 } else { 4356
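		/* No HW submit queue (ELSQ); submissions go directly to the ELSP port. */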
execlists->submit_reg = uncore->regs + 4357 i915_mmio_reg_offset(RING_ELSP(base)); 4358 } 4359 4360 execlists->csb_status = 4361 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 4362 4363 execlists->csb_write = 4364 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 4365 4366 if (INTEL_GEN(i915) < 11) 4367 execlists->csb_size = GEN8_CSB_ENTRIES; 4368 else 4369 execlists->csb_size = GEN11_CSB_ENTRIES; 4370 4371 reset_csb_pointers(engine); 4372 4373 /* Finally, take ownership and responsibility for cleanup! */ 4374 engine->release = execlists_release; 4375 4376 return 0; 4377 } 4378 4379 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) 4380 { 4381 u32 indirect_ctx_offset; 4382 4383 switch (INTEL_GEN(engine->i915)) { 4384 default: 4385 MISSING_CASE(INTEL_GEN(engine->i915)); 4386 /* fall through */ 4387 case 12: 4388 indirect_ctx_offset = 4389 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4390 break; 4391 case 11: 4392 indirect_ctx_offset = 4393 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4394 break; 4395 case 10: 4396 indirect_ctx_offset = 4397 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4398 break; 4399 case 9: 4400 indirect_ctx_offset = 4401 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4402 break; 4403 case 8: 4404 indirect_ctx_offset = 4405 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4406 break; 4407 } 4408 4409 return indirect_ctx_offset; 4410 } 4411 4412 4413 static void init_common_reg_state(u32 * const regs, 4414 const struct intel_engine_cs *engine, 4415 const struct intel_ring *ring, 4416 bool inhibit) 4417 { 4418 u32 ctl; 4419 4420 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 4421 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 4422 if (inhibit) 4423 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 4424 if (INTEL_GEN(engine->i915) < 11) 4425 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 4426 CTX_CTRL_RS_CTX_ENABLE); 4427 regs[CTX_CONTEXT_CONTROL] = ctl; 4428 4429 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 4430 } 4431 4432 static void init_wa_bb_reg_state(u32 * const regs, 4433 const struct intel_engine_cs *engine, 4434 u32 pos_bb_per_ctx) 4435 { 4436 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 4437 4438 if (wa_ctx->per_ctx.size) { 4439 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4440 4441 regs[pos_bb_per_ctx] = 4442 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 4443 } 4444 4445 if (wa_ctx->indirect_ctx.size) { 4446 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4447 4448 regs[pos_bb_per_ctx + 2] = 4449 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 4450 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 4451 4452 regs[pos_bb_per_ctx + 4] = 4453 intel_lr_indirect_ctx_offset(engine) << 6; 4454 } 4455 } 4456 4457 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 4458 { 4459 if (i915_vm_is_4lvl(&ppgtt->vm)) { 4460 /* 64b PPGTT (48bit canonical) 4461 * PDP0_DESCRIPTOR contains the base address to PML4 and 4462 * other PDP Descriptors are ignored. 
4463 */ 4464 ASSIGN_CTX_PML4(ppgtt, regs); 4465 } else { 4466 ASSIGN_CTX_PDP(ppgtt, regs, 3); 4467 ASSIGN_CTX_PDP(ppgtt, regs, 2); 4468 ASSIGN_CTX_PDP(ppgtt, regs, 1); 4469 ASSIGN_CTX_PDP(ppgtt, regs, 0); 4470 } 4471 } 4472 4473 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 4474 { 4475 if (i915_is_ggtt(vm)) 4476 return i915_vm_to_ggtt(vm)->alias; 4477 else 4478 return i915_vm_to_ppgtt(vm); 4479 } 4480 4481 static void execlists_init_reg_state(u32 *regs, 4482 const struct intel_context *ce, 4483 const struct intel_engine_cs *engine, 4484 const struct intel_ring *ring, 4485 bool inhibit) 4486 { 4487 /* 4488 * A context is actually a big batch buffer with several 4489 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 4490 * values we are setting here are only for the first context restore: 4491 * on a subsequent save, the GPU will recreate this batchbuffer with new 4492 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 4493 * we are not initializing here). 4494 * 4495 * Must keep consistent with virtual_update_register_offsets(). 4496 */ 4497 set_offsets(regs, reg_offsets(engine), engine, inhibit); 4498 4499 init_common_reg_state(regs, engine, ring, inhibit); 4500 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 4501 4502 init_wa_bb_reg_state(regs, engine, 4503 INTEL_GEN(engine->i915) >= 12 ? 4504 GEN12_CTX_BB_PER_CTX_PTR : 4505 CTX_BB_PER_CTX_PTR); 4506 4507 __reset_stop_ring(regs, engine); 4508 } 4509 4510 static int 4511 populate_lr_context(struct intel_context *ce, 4512 struct drm_i915_gem_object *ctx_obj, 4513 struct intel_engine_cs *engine, 4514 struct intel_ring *ring) 4515 { 4516 bool inhibit = true; 4517 void *vaddr; 4518 int ret; 4519 4520 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 4521 if (IS_ERR(vaddr)) { 4522 ret = PTR_ERR(vaddr); 4523 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 4524 return ret; 4525 } 4526 4527 set_redzone(vaddr, engine); 4528 4529 if (engine->default_state) { 4530 void *defaults; 4531 4532 defaults = i915_gem_object_pin_map(engine->default_state, 4533 I915_MAP_WB); 4534 if (IS_ERR(defaults)) { 4535 ret = PTR_ERR(defaults); 4536 goto err_unpin_ctx; 4537 } 4538 4539 memcpy(vaddr, defaults, engine->context_size); 4540 i915_gem_object_unpin_map(engine->default_state); 4541 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 4542 inhibit = false; 4543 } 4544 4545 /* The second page of the context object contains some fields which must 4546 * be set up prior to the first execution. 
*/ 4547 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE, 4548 ce, engine, ring, inhibit); 4549 4550 ret = 0; 4551 err_unpin_ctx: 4552 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 4553 i915_gem_object_unpin_map(ctx_obj); 4554 return ret; 4555 } 4556 4557 static int __execlists_context_alloc(struct intel_context *ce, 4558 struct intel_engine_cs *engine) 4559 { 4560 struct drm_i915_gem_object *ctx_obj; 4561 struct intel_ring *ring; 4562 struct i915_vma *vma; 4563 u32 context_size; 4564 int ret; 4565 4566 GEM_BUG_ON(ce->state); 4567 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 4568 4569 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4570 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 4571 4572 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 4573 if (IS_ERR(ctx_obj)) 4574 return PTR_ERR(ctx_obj); 4575 4576 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 4577 if (IS_ERR(vma)) { 4578 ret = PTR_ERR(vma); 4579 goto error_deref_obj; 4580 } 4581 4582 if (!ce->timeline) { 4583 struct intel_timeline *tl; 4584 4585 tl = intel_timeline_create(engine->gt, NULL); 4586 if (IS_ERR(tl)) { 4587 ret = PTR_ERR(tl); 4588 goto error_deref_obj; 4589 } 4590 4591 ce->timeline = tl; 4592 } 4593 4594 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 4595 if (IS_ERR(ring)) { 4596 ret = PTR_ERR(ring); 4597 goto error_deref_obj; 4598 } 4599 4600 ret = populate_lr_context(ce, ctx_obj, engine, ring); 4601 if (ret) { 4602 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 4603 goto error_ring_free; 4604 } 4605 4606 ce->ring = ring; 4607 ce->state = vma; 4608 4609 return 0; 4610 4611 error_ring_free: 4612 intel_ring_put(ring); 4613 error_deref_obj: 4614 i915_gem_object_put(ctx_obj); 4615 return ret; 4616 } 4617 4618 static struct list_head *virtual_queue(struct virtual_engine *ve) 4619 { 4620 return &ve->base.execlists.default_priolist.requests[0]; 4621 } 4622 4623 static void virtual_context_destroy(struct kref *kref) 4624 { 4625 struct virtual_engine *ve = 4626 container_of(kref, typeof(*ve), context.ref); 4627 unsigned int n; 4628 4629 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4630 GEM_BUG_ON(ve->request); 4631 GEM_BUG_ON(ve->context.inflight); 4632 4633 for (n = 0; n < ve->num_siblings; n++) { 4634 struct intel_engine_cs *sibling = ve->siblings[n]; 4635 struct rb_node *node = &ve->nodes[sibling->id].rb; 4636 unsigned long flags; 4637 4638 if (RB_EMPTY_NODE(node)) 4639 continue; 4640 4641 spin_lock_irqsave(&sibling->active.lock, flags); 4642 4643 /* Detachment is lazily performed in the execlists tasklet */ 4644 if (!RB_EMPTY_NODE(node)) 4645 rb_erase_cached(node, &sibling->execlists.virtual); 4646 4647 spin_unlock_irqrestore(&sibling->active.lock, flags); 4648 } 4649 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 4650 4651 if (ve->context.state) 4652 __execlists_context_fini(&ve->context); 4653 intel_context_fini(&ve->context); 4654 4655 kfree(ve->bonds); 4656 kfree(ve); 4657 } 4658 4659 static void virtual_engine_initial_hint(struct virtual_engine *ve) 4660 { 4661 int swp; 4662 4663 /* 4664 * Pick a random sibling on starting to help spread the load around. 4665 * 4666 * New contexts are typically created with exactly the same order 4667 * of siblings, and often started in batches. Due to the way we iterate 4668 * the array of sibling when submitting requests, sibling[0] is 4669 * prioritised for dequeuing. 
If we make sure that sibling[0] is fairly 4670 * randomised across the system, we also help spread the load by the 4671 * first engine we inspect being different each time. 4672 * 4673 * NB This does not force us to execute on this engine, it will just 4674 * typically be the first we inspect for submission. 4675 */ 4676 swp = prandom_u32_max(ve->num_siblings); 4677 if (!swp) 4678 return; 4679 4680 swap(ve->siblings[swp], ve->siblings[0]); 4681 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 4682 virtual_update_register_offsets(ve->context.lrc_reg_state, 4683 ve->siblings[0]); 4684 } 4685 4686 static int virtual_context_alloc(struct intel_context *ce) 4687 { 4688 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4689 4690 return __execlists_context_alloc(ce, ve->siblings[0]); 4691 } 4692 4693 static int virtual_context_pin(struct intel_context *ce) 4694 { 4695 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4696 int err; 4697 4698 /* Note: we must use a real engine class for setting up reg state */ 4699 err = __execlists_context_pin(ce, ve->siblings[0]); 4700 if (err) 4701 return err; 4702 4703 virtual_engine_initial_hint(ve); 4704 return 0; 4705 } 4706 4707 static void virtual_context_enter(struct intel_context *ce) 4708 { 4709 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4710 unsigned int n; 4711 4712 for (n = 0; n < ve->num_siblings; n++) 4713 intel_engine_pm_get(ve->siblings[n]); 4714 4715 intel_timeline_enter(ce->timeline); 4716 } 4717 4718 static void virtual_context_exit(struct intel_context *ce) 4719 { 4720 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4721 unsigned int n; 4722 4723 intel_timeline_exit(ce->timeline); 4724 4725 for (n = 0; n < ve->num_siblings; n++) 4726 intel_engine_pm_put(ve->siblings[n]); 4727 } 4728 4729 static const struct intel_context_ops virtual_context_ops = { 4730 .alloc = virtual_context_alloc, 4731 4732 .pin = virtual_context_pin, 4733 .unpin = execlists_context_unpin, 4734 4735 .enter = virtual_context_enter, 4736 .exit = virtual_context_exit, 4737 4738 .destroy = virtual_context_destroy, 4739 }; 4740 4741 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 4742 { 4743 struct i915_request *rq; 4744 intel_engine_mask_t mask; 4745 4746 rq = READ_ONCE(ve->request); 4747 if (!rq) 4748 return 0; 4749 4750 /* The rq is ready for submission; rq->execution_mask is now stable. 
*/ 4751 mask = rq->execution_mask; 4752 if (unlikely(!mask)) { 4753 /* Invalid selection, submit to a random engine in error */ 4754 i915_request_skip(rq, -ENODEV); 4755 mask = ve->siblings[0]->mask; 4756 } 4757 4758 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 4759 rq->fence.context, rq->fence.seqno, 4760 mask, ve->base.execlists.queue_priority_hint); 4761 4762 return mask; 4763 } 4764 4765 static void virtual_submission_tasklet(unsigned long data) 4766 { 4767 struct virtual_engine * const ve = (struct virtual_engine *)data; 4768 const int prio = ve->base.execlists.queue_priority_hint; 4769 intel_engine_mask_t mask; 4770 unsigned int n; 4771 4772 rcu_read_lock(); 4773 mask = virtual_submission_mask(ve); 4774 rcu_read_unlock(); 4775 if (unlikely(!mask)) 4776 return; 4777 4778 local_irq_disable(); 4779 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 4780 struct intel_engine_cs *sibling = ve->siblings[n]; 4781 struct ve_node * const node = &ve->nodes[sibling->id]; 4782 struct rb_node **parent, *rb; 4783 bool first; 4784 4785 if (unlikely(!(mask & sibling->mask))) { 4786 if (!RB_EMPTY_NODE(&node->rb)) { 4787 spin_lock(&sibling->active.lock); 4788 rb_erase_cached(&node->rb, 4789 &sibling->execlists.virtual); 4790 RB_CLEAR_NODE(&node->rb); 4791 spin_unlock(&sibling->active.lock); 4792 } 4793 continue; 4794 } 4795 4796 spin_lock(&sibling->active.lock); 4797 4798 if (!RB_EMPTY_NODE(&node->rb)) { 4799 /* 4800 * Cheat and avoid rebalancing the tree if we can 4801 * reuse this node in situ. 4802 */ 4803 first = rb_first_cached(&sibling->execlists.virtual) == 4804 &node->rb; 4805 if (prio == node->prio || (prio > node->prio && first)) 4806 goto submit_engine; 4807 4808 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 4809 } 4810 4811 rb = NULL; 4812 first = true; 4813 parent = &sibling->execlists.virtual.rb_root.rb_node; 4814 while (*parent) { 4815 struct ve_node *other; 4816 4817 rb = *parent; 4818 other = rb_entry(rb, typeof(*other), rb); 4819 if (prio > other->prio) { 4820 parent = &rb->rb_left; 4821 } else { 4822 parent = &rb->rb_right; 4823 first = false; 4824 } 4825 } 4826 4827 rb_link_node(&node->rb, rb, parent); 4828 rb_insert_color_cached(&node->rb, 4829 &sibling->execlists.virtual, 4830 first); 4831 4832 submit_engine: 4833 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 4834 node->prio = prio; 4835 if (first && prio > sibling->execlists.queue_priority_hint) { 4836 sibling->execlists.queue_priority_hint = prio; 4837 tasklet_hi_schedule(&sibling->execlists.tasklet); 4838 } 4839 4840 spin_unlock(&sibling->active.lock); 4841 } 4842 local_irq_enable(); 4843 } 4844 4845 static void virtual_submit_request(struct i915_request *rq) 4846 { 4847 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4848 struct i915_request *old; 4849 unsigned long flags; 4850 4851 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 4852 rq->fence.context, 4853 rq->fence.seqno); 4854 4855 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 4856 4857 spin_lock_irqsave(&ve->base.active.lock, flags); 4858 4859 old = ve->request; 4860 if (old) { /* background completion event from preempt-to-busy */ 4861 GEM_BUG_ON(!i915_request_completed(old)); 4862 __i915_request_submit(old); 4863 i915_request_put(old); 4864 } 4865 4866 if (i915_request_completed(rq)) { 4867 __i915_request_submit(rq); 4868 4869 ve->base.execlists.queue_priority_hint = INT_MIN; 4870 ve->request = NULL; 4871 } else { 4872 ve->base.execlists.queue_priority_hint = rq_prio(rq); 4873 ve->request = i915_request_get(rq); 
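		/* Park the request on the virtual engine's queue; the tasklet kicked below offers it to each eligible sibling. */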
4874 4875 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4876 list_move_tail(&rq->sched.link, virtual_queue(ve)); 4877 4878 tasklet_schedule(&ve->base.execlists.tasklet); 4879 } 4880 4881 spin_unlock_irqrestore(&ve->base.active.lock, flags); 4882 } 4883 4884 static struct ve_bond * 4885 virtual_find_bond(struct virtual_engine *ve, 4886 const struct intel_engine_cs *master) 4887 { 4888 int i; 4889 4890 for (i = 0; i < ve->num_bonds; i++) { 4891 if (ve->bonds[i].master == master) 4892 return &ve->bonds[i]; 4893 } 4894 4895 return NULL; 4896 } 4897 4898 static void 4899 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 4900 { 4901 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4902 intel_engine_mask_t allowed, exec; 4903 struct ve_bond *bond; 4904 4905 allowed = ~to_request(signal)->engine->mask; 4906 4907 bond = virtual_find_bond(ve, to_request(signal)->engine); 4908 if (bond) 4909 allowed &= bond->sibling_mask; 4910 4911 /* Restrict the bonded request to run on only the available engines */ 4912 exec = READ_ONCE(rq->execution_mask); 4913 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 4914 ; 4915 4916 /* Prevent the master from being re-run on the bonded engines */ 4917 to_request(signal)->execution_mask &= ~allowed; 4918 } 4919 4920 struct intel_context * 4921 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 4922 unsigned int count) 4923 { 4924 struct virtual_engine *ve; 4925 unsigned int n; 4926 int err; 4927 4928 if (count == 0) 4929 return ERR_PTR(-EINVAL); 4930 4931 if (count == 1) 4932 return intel_context_create(siblings[0]); 4933 4934 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 4935 if (!ve) 4936 return ERR_PTR(-ENOMEM); 4937 4938 ve->base.i915 = siblings[0]->i915; 4939 ve->base.gt = siblings[0]->gt; 4940 ve->base.uncore = siblings[0]->uncore; 4941 ve->base.id = -1; 4942 4943 ve->base.class = OTHER_CLASS; 4944 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 4945 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4946 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4947 4948 /* 4949 * The decision on whether to submit a request using semaphores 4950 * depends on the saturated state of the engine. We only compute 4951 * this during HW submission of the request, and we need for this 4952 * state to be globally applied to all requests being submitted 4953 * to this engine. Virtual engines encompass more than one physical 4954 * engine and so we cannot accurately tell in advance if one of those 4955 * engines is already saturated and so cannot afford to use a semaphore 4956 * and be pessimized in priority for doing so -- if we are the only 4957 * context using semaphores after all other clients have stopped, we 4958 * will be starved on the saturated system. Such a global switch for 4959 * semaphores is less than ideal, but alas is the current compromise. 
4960 */ 4961 ve->base.saturated = ALL_ENGINES; 4962 4963 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 4964 4965 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 4966 intel_engine_init_breadcrumbs(&ve->base); 4967 intel_engine_init_execlists(&ve->base); 4968 4969 ve->base.cops = &virtual_context_ops; 4970 ve->base.request_alloc = execlists_request_alloc; 4971 4972 ve->base.schedule = i915_schedule; 4973 ve->base.submit_request = virtual_submit_request; 4974 ve->base.bond_execute = virtual_bond_execute; 4975 4976 INIT_LIST_HEAD(virtual_queue(ve)); 4977 ve->base.execlists.queue_priority_hint = INT_MIN; 4978 tasklet_init(&ve->base.execlists.tasklet, 4979 virtual_submission_tasklet, 4980 (unsigned long)ve); 4981 4982 intel_context_init(&ve->context, &ve->base); 4983 4984 for (n = 0; n < count; n++) { 4985 struct intel_engine_cs *sibling = siblings[n]; 4986 4987 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 4988 if (sibling->mask & ve->base.mask) { 4989 DRM_DEBUG("duplicate %s entry in load balancer\n", 4990 sibling->name); 4991 err = -EINVAL; 4992 goto err_put; 4993 } 4994 4995 /* 4996 * The virtual engine implementation is tightly coupled to 4997 * the execlists backend -- we push out request directly 4998 * into a tree inside each physical engine. We could support 4999 * layering if we handle cloning of the requests and 5000 * submitting a copy into each backend. 5001 */ 5002 if (sibling->execlists.tasklet.func != 5003 execlists_submission_tasklet) { 5004 err = -ENODEV; 5005 goto err_put; 5006 } 5007 5008 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 5009 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 5010 5011 ve->siblings[ve->num_siblings++] = sibling; 5012 ve->base.mask |= sibling->mask; 5013 5014 /* 5015 * All physical engines must be compatible for their emission 5016 * functions (as we build the instructions during request 5017 * construction and do not alter them before submission 5018 * on the physical engine). We use the engine class as a guide 5019 * here, although that could be refined. 
5020 */ 5021 if (ve->base.class != OTHER_CLASS) { 5022 if (ve->base.class != sibling->class) { 5023 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 5024 sibling->class, ve->base.class); 5025 err = -EINVAL; 5026 goto err_put; 5027 } 5028 continue; 5029 } 5030 5031 ve->base.class = sibling->class; 5032 ve->base.uabi_class = sibling->uabi_class; 5033 snprintf(ve->base.name, sizeof(ve->base.name), 5034 "v%dx%d", ve->base.class, count); 5035 ve->base.context_size = sibling->context_size; 5036 5037 ve->base.emit_bb_start = sibling->emit_bb_start; 5038 ve->base.emit_flush = sibling->emit_flush; 5039 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 5040 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 5041 ve->base.emit_fini_breadcrumb_dw = 5042 sibling->emit_fini_breadcrumb_dw; 5043 5044 ve->base.flags = sibling->flags; 5045 } 5046 5047 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 5048 5049 return &ve->context; 5050 5051 err_put: 5052 intel_context_put(&ve->context); 5053 return ERR_PTR(err); 5054 } 5055 5056 struct intel_context * 5057 intel_execlists_clone_virtual(struct intel_engine_cs *src) 5058 { 5059 struct virtual_engine *se = to_virtual_engine(src); 5060 struct intel_context *dst; 5061 5062 dst = intel_execlists_create_virtual(se->siblings, 5063 se->num_siblings); 5064 if (IS_ERR(dst)) 5065 return dst; 5066 5067 if (se->num_bonds) { 5068 struct virtual_engine *de = to_virtual_engine(dst->engine); 5069 5070 de->bonds = kmemdup(se->bonds, 5071 sizeof(*se->bonds) * se->num_bonds, 5072 GFP_KERNEL); 5073 if (!de->bonds) { 5074 intel_context_put(dst); 5075 return ERR_PTR(-ENOMEM); 5076 } 5077 5078 de->num_bonds = se->num_bonds; 5079 } 5080 5081 return dst; 5082 } 5083 5084 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 5085 const struct intel_engine_cs *master, 5086 const struct intel_engine_cs *sibling) 5087 { 5088 struct virtual_engine *ve = to_virtual_engine(engine); 5089 struct ve_bond *bond; 5090 int n; 5091 5092 /* Sanity check the sibling is part of the virtual engine */ 5093 for (n = 0; n < ve->num_siblings; n++) 5094 if (sibling == ve->siblings[n]) 5095 break; 5096 if (n == ve->num_siblings) 5097 return -EINVAL; 5098 5099 bond = virtual_find_bond(ve, master); 5100 if (bond) { 5101 bond->sibling_mask |= sibling->mask; 5102 return 0; 5103 } 5104 5105 bond = krealloc(ve->bonds, 5106 sizeof(*bond) * (ve->num_bonds + 1), 5107 GFP_KERNEL); 5108 if (!bond) 5109 return -ENOMEM; 5110 5111 bond[ve->num_bonds].master = master; 5112 bond[ve->num_bonds].sibling_mask = sibling->mask; 5113 5114 ve->bonds = bond; 5115 ve->num_bonds++; 5116 5117 return 0; 5118 } 5119 5120 struct intel_engine_cs * 5121 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 5122 unsigned int sibling) 5123 { 5124 struct virtual_engine *ve = to_virtual_engine(engine); 5125 5126 if (sibling >= ve->num_siblings) 5127 return NULL; 5128 5129 return ve->siblings[sibling]; 5130 } 5131 5132 void intel_execlists_show_requests(struct intel_engine_cs *engine, 5133 struct drm_printer *m, 5134 void (*show_request)(struct drm_printer *m, 5135 struct i915_request *rq, 5136 const char *prefix), 5137 unsigned int max) 5138 { 5139 const struct intel_engine_execlists *execlists = &engine->execlists; 5140 struct i915_request *rq, *last; 5141 unsigned long flags; 5142 unsigned int count; 5143 struct rb_node *rb; 5144 5145 spin_lock_irqsave(&engine->active.lock, flags); 5146 5147 last = NULL; 5148 count = 0; 5149 list_for_each_entry(rq, &engine->active.requests, 
sched.link) { 5150 if (count++ < max - 1) 5151 show_request(m, rq, "\t\tE "); 5152 else 5153 last = rq; 5154 } 5155 if (last) { 5156 if (count > max) { 5157 drm_printf(m, 5158 "\t\t...skipping %d executing requests...\n", 5159 count - max); 5160 } 5161 show_request(m, last, "\t\tE "); 5162 } 5163 5164 last = NULL; 5165 count = 0; 5166 if (execlists->queue_priority_hint != INT_MIN) 5167 drm_printf(m, "\t\tQueue priority hint: %d\n", 5168 execlists->queue_priority_hint); 5169 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 5170 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 5171 int i; 5172 5173 priolist_for_each_request(rq, p, i) { 5174 if (count++ < max - 1) 5175 show_request(m, rq, "\t\tQ "); 5176 else 5177 last = rq; 5178 } 5179 } 5180 if (last) { 5181 if (count > max) { 5182 drm_printf(m, 5183 "\t\t...skipping %d queued requests...\n", 5184 count - max); 5185 } 5186 show_request(m, last, "\t\tQ "); 5187 } 5188 5189 last = NULL; 5190 count = 0; 5191 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 5192 struct virtual_engine *ve = 5193 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 5194 struct i915_request *rq = READ_ONCE(ve->request); 5195 5196 if (rq) { 5197 if (count++ < max - 1) 5198 show_request(m, rq, "\t\tV "); 5199 else 5200 last = rq; 5201 } 5202 } 5203 if (last) { 5204 if (count > max) { 5205 drm_printf(m, 5206 "\t\t...skipping %d virtual requests...\n", 5207 count - max); 5208 } 5209 show_request(m, last, "\t\tV "); 5210 } 5211 5212 spin_unlock_irqrestore(&engine->active.lock, flags); 5213 } 5214 5215 void intel_lr_context_reset(struct intel_engine_cs *engine, 5216 struct intel_context *ce, 5217 u32 head, 5218 bool scrub) 5219 { 5220 GEM_BUG_ON(!intel_context_is_pinned(ce)); 5221 5222 /* 5223 * We want a simple context + ring to execute the breadcrumb update. 5224 * We cannot rely on the context being intact across the GPU hang, 5225 * so clear it and rebuild just what we need for the breadcrumb. 5226 * All pending requests for this context will be zapped, and any 5227 * future request will be after userspace has had the opportunity 5228 * to recreate its own state. 5229 */ 5230 if (scrub) 5231 restore_default_state(ce, engine); 5232 5233 /* Rerun the request; its payload has been neutered (if guilty). */ 5234 __execlists_update_reg_state(ce, engine, head); 5235 } 5236 5237 bool 5238 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 5239 { 5240 return engine->set_default_submission == 5241 intel_execlists_set_default_submission; 5242 } 5243 5244 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 5245 #include "selftest_lrc.c" 5246 #endif 5247
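/*
 * Illustrative sketch only (not part of the driver): given two sibling
 * engines of the same class, assumed here to be 'vcs0' and 'vcs1', a caller
 * could build and use a load-balancing virtual engine roughly as follows:
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	... pin the context and submit requests as with a physical engine ...
 *
 *	intel_context_put(ce);
 */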