/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things in the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
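 * For example, if the head of the queue holds three requests for contexts
 * A, A and B (in submission order), the first A request is dropped from the
 * head (the second A request's tail already covers its commands) and the
 * pair (A, B) is submitted. Had the queue held only the two A requests,
 * the list submitted would have been (A, NULL).
 *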
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this
 * second context will be at the head of the queue when we remove the first
 * one. This request will then be resubmitted along with a new request for a
 * different context, which will cause the hardware to continue executing the
 * second request and queue the new request (the GPU detects the condition of
 * a context getting preempted with the same context and optimizes the context
 * switch flow by not doing preemption, but just sampling the new tail
 * pointer).
 *
 */
#include <linux/interrupt.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID			0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[0];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     struct intel_context *ce,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);

static void mark_eio(struct i915_request *rq)
{
	if (!i915_request_signaled(rq))
		dma_fence_set_error(&rq->fence, -EIO);
	i915_request_mark_complete(rq);
}

static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static inline void
ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	/*
	 * On unwinding the active request, we give it a priority bump
	 * if it has completed waiting on any semaphore. If we know that
	 * the request has already started, we can prevent an unwanted
	 * preempt-to-idle cycle by taking that into account now.
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

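/*
 * To illustrate the arithmetic in queue_prio() (the shift is whatever
 * I915_USER_PRIORITY_SHIFT happens to be): a priolist at user priority 0
 * whose used bitmask has only its lowest bit set (the highest internal
 * sub-level) yields ((0 + 1) << shift) - 1, i.e. one below the base value
 * of the next user priority level.
 */
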
static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 */
	last_prio = effective_prio(rq);
	if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
					 last_prio))
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
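 *
 * As a purely illustrative example of the Gen11+ layout: a context with
 * SW context ID 0x005 running on instance 1 of engine class 1 would
 * contribute (0x005ull << 37) | (1ull << 48) | (1ull << 61) to the upper
 * dword, on top of the LRCA and flag bits packed into the lower dword by
 * lrc_descriptor() below.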
428 */ 429 static u64 430 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 431 { 432 struct i915_gem_context *ctx = ce->gem_context; 433 u64 desc; 434 435 BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH))); 436 BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH))); 437 438 desc = INTEL_LEGACY_32B_CONTEXT; 439 if (i915_vm_is_4lvl(ce->vm)) 440 desc = INTEL_LEGACY_64B_CONTEXT; 441 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 442 443 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 444 if (IS_GEN(engine->i915, 8)) 445 desc |= GEN8_CTX_L3LLC_COHERENT; 446 447 desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE; 448 /* bits 12-31 */ 449 /* 450 * The following 32bits are copied into the OA reports (dword 2). 451 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing 452 * anything below. 453 */ 454 if (INTEL_GEN(engine->i915) >= 11) { 455 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH)); 456 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT; 457 /* bits 37-47 */ 458 459 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 460 /* bits 48-53 */ 461 462 /* TODO: decide what to do with SW counter (bits 55-60) */ 463 464 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 465 /* bits 61-63 */ 466 } else { 467 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH)); 468 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ 469 } 470 471 return desc; 472 } 473 474 static void unwind_wa_tail(struct i915_request *rq) 475 { 476 rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); 477 assert_ring_tail_valid(rq->ring, rq->tail); 478 } 479 480 static struct i915_request * 481 __unwind_incomplete_requests(struct intel_engine_cs *engine) 482 { 483 struct i915_request *rq, *rn, *active = NULL; 484 struct list_head *uninitialized_var(pl); 485 int prio = I915_PRIORITY_INVALID; 486 487 lockdep_assert_held(&engine->active.lock); 488 489 list_for_each_entry_safe_reverse(rq, rn, 490 &engine->active.requests, 491 sched.link) { 492 struct intel_engine_cs *owner; 493 494 if (i915_request_completed(rq)) 495 continue; /* XXX */ 496 497 __i915_request_unsubmit(rq); 498 unwind_wa_tail(rq); 499 500 /* 501 * Push the request back into the queue for later resubmission. 502 * If this request is not native to this physical engine (i.e. 503 * it came from a virtual source), push it back onto the virtual 504 * engine so that it can be moved across onto another physical 505 * engine as load dictates. 506 */ 507 owner = rq->hw_context->engine; 508 if (likely(owner == engine)) { 509 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 510 if (rq_prio(rq) != prio) { 511 prio = rq_prio(rq); 512 pl = i915_sched_lookup_priolist(engine, prio); 513 } 514 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 515 516 list_move(&rq->sched.link, pl); 517 active = rq; 518 } else { 519 /* 520 * Decouple the virtual breadcrumb before moving it 521 * back to the virtual engine -- we don't want the 522 * request to complete in the background and try 523 * and cancel the breadcrumb on the virtual engine 524 * (instead of the old engine where it is linked)! 
525 */ 526 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 527 &rq->fence.flags)) { 528 spin_lock(&rq->lock); 529 i915_request_cancel_breadcrumb(rq); 530 spin_unlock(&rq->lock); 531 } 532 rq->engine = owner; 533 owner->submit_request(rq); 534 active = NULL; 535 } 536 } 537 538 return active; 539 } 540 541 struct i915_request * 542 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 543 { 544 struct intel_engine_cs *engine = 545 container_of(execlists, typeof(*engine), execlists); 546 547 return __unwind_incomplete_requests(engine); 548 } 549 550 static inline void 551 execlists_context_status_change(struct i915_request *rq, unsigned long status) 552 { 553 /* 554 * Only used when GVT-g is enabled now. When GVT-g is disabled, 555 * The compiler should eliminate this function as dead-code. 556 */ 557 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 558 return; 559 560 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 561 status, rq); 562 } 563 564 static inline struct intel_engine_cs * 565 __execlists_schedule_in(struct i915_request *rq) 566 { 567 struct intel_engine_cs * const engine = rq->engine; 568 struct intel_context * const ce = rq->hw_context; 569 570 intel_context_get(ce); 571 572 intel_gt_pm_get(engine->gt); 573 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 574 intel_engine_context_in(engine); 575 576 return engine; 577 } 578 579 static inline struct i915_request * 580 execlists_schedule_in(struct i915_request *rq, int idx) 581 { 582 struct intel_context * const ce = rq->hw_context; 583 struct intel_engine_cs *old; 584 585 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 586 trace_i915_request_in(rq, idx); 587 588 old = READ_ONCE(ce->inflight); 589 do { 590 if (!old) { 591 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 592 break; 593 } 594 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 595 596 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 597 return i915_request_get(rq); 598 } 599 600 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 601 { 602 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 603 struct i915_request *next = READ_ONCE(ve->request); 604 605 if (next && next->execution_mask & ~rq->execution_mask) 606 tasklet_schedule(&ve->base.execlists.tasklet); 607 } 608 609 static inline void 610 __execlists_schedule_out(struct i915_request *rq, 611 struct intel_engine_cs * const engine) 612 { 613 struct intel_context * const ce = rq->hw_context; 614 615 intel_engine_context_out(engine); 616 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 617 intel_gt_pm_put(engine->gt); 618 619 /* 620 * If this is part of a virtual engine, its next request may 621 * have been blocked waiting for access to the active context. 622 * We have to kick all the siblings again in case we need to 623 * switch (e.g. the next request is not runnable on this 624 * engine). Hopefully, we will already have submitted the next 625 * request before the tasklet runs and do not need to rebuild 626 * each virtual tree and kick everyone again. 627 */ 628 if (ce->engine != engine) 629 kick_siblings(rq, ce); 630 631 intel_context_put(ce); 632 } 633 634 static inline void 635 execlists_schedule_out(struct i915_request *rq) 636 { 637 struct intel_context * const ce = rq->hw_context; 638 struct intel_engine_cs *cur, *old; 639 640 trace_i915_request_out(rq); 641 642 old = READ_ONCE(ce->inflight); 643 do 644 cur = ptr_unmask_bits(old, 2) ? 
			ptr_dec(old) : NULL;
	while (!try_cmpxchg(&ce->inflight, &old, cur));
	if (!cur)
		__execlists_schedule_out(rq, old);

	i915_request_put(rq);
}

static u64 execlists_update_context(const struct i915_request *rq)
{
	struct intel_context *ce = rq->hw_context;
	u64 desc;

	ce->lrc_reg_state[CTX_RING_TAIL + 1] =
		intel_ring_set_tail(rq->ring, rq->tail);

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access. However, the
	 * empirical evidence (esp. on Braswell) suggests that the WC write
	 * into memory may not be visible to the HW prior to the completion
	 * of the UC register write and that we may begin execution from the
	 * context before its image is complete, leading to invalid PD chasing.
	 *
	 * Furthermore, Braswell, at least, wants a full mb to be sure that
	 * the writes are coherent in memory (visible to the GPU) prior to
	 * execution, and not just visible to other CPUs (as is the result of
	 * wmb).
	 */
	mb();

	desc = ce->lrc_desc;
	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;

	return desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

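/*
 * A note on the ordering in write_desc(): with the ELSQ (ctrl_reg present,
 * gen11+) each port has its own pair of submit registers and nothing is
 * acted upon until EL_CTRL_LOAD is written later; on the legacy ELSP the
 * same register is written repeatedly, upper dword then lower dword, with
 * execlists_submit_ports() walking the ports last to first so that the
 * final write (the lower dword of port 0) triggers the submission.
 */
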
static __maybe_unused void
trace_ports(const struct intel_engine_execlists *execlists,
	    const char *msg,
	    struct i915_request * const *ports)
{
	const struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
		  engine->name, msg,
		  ports[0]->fence.context,
		  ports[0]->fence.seqno,
		  i915_request_completed(ports[0]) ? "!" :
		  i915_request_started(ports[0]) ? "*" :
		  "",
		  ports[1] ? ports[1]->fence.context : 0,
		  ports[1] ? ports[1]->fence.seqno : 0);
}

static __maybe_unused bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{
	struct i915_request * const *port, *rq;
	struct intel_context *ce = NULL;

	trace_ports(execlists, msg, execlists->pending);

	if (!execlists->pending[0])
		return false;

	if (execlists->pending[execlists_num_ports(execlists)])
		return false;

	for (port = execlists->pending; (rq = *port); port++) {
		if (ce == rq->hw_context)
			return false;

		ce = rq->hw_context;
		if (i915_request_completed(rq))
			continue;

		if (i915_active_is_idle(&ce->active))
			return false;

		if (!i915_vma_is_pinned(ce->state))
			return false;
	}

	return ce;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	unsigned int n;

	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained, i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq = execlists->pending[n];

		write_desc(execlists,
			   rq ? execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (!can_merge_ctx(prev->hw_context, next->hw_context))
		return false;

	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	u32 base = engine->mmio_base;

	/* Must match execlists_init_reg_state()! */

	regs[CTX_CONTEXT_CONTROL] =
		i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
	regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
	regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
	regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
	regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));

	regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
	regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
	regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
	regs[CTX_SECOND_BB_HEAD_U] =
		i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
	regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
	regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));

	regs[CTX_CTX_TIMESTAMP] =
		i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
	regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
	regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
	regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
	regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
	regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
	regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
	regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
	regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));

	if (engine->class == RENDER_CLASS) {
		regs[CTX_RCS_INDIRECT_CTX] =
			i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
		regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
			i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
		regs[CTX_BB_PER_CTX_PTR] =
			i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));

		regs[CTX_R_PWR_CLK_STATE] =
			i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
	}
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
886 */ 887 inflight = intel_context_inflight(&ve->context); 888 if (inflight && inflight != engine) 889 return false; 890 891 return true; 892 } 893 894 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, 895 struct intel_engine_cs *engine) 896 { 897 struct intel_engine_cs *old = ve->siblings[0]; 898 899 /* All unattached (rq->engine == old) must already be completed */ 900 901 spin_lock(&old->breadcrumbs.irq_lock); 902 if (!list_empty(&ve->context.signal_link)) { 903 list_move_tail(&ve->context.signal_link, 904 &engine->breadcrumbs.signalers); 905 intel_engine_queue_breadcrumbs(engine); 906 } 907 spin_unlock(&old->breadcrumbs.irq_lock); 908 } 909 910 static struct i915_request * 911 last_active(const struct intel_engine_execlists *execlists) 912 { 913 struct i915_request * const *last = READ_ONCE(execlists->active); 914 915 while (*last && i915_request_completed(*last)) 916 last++; 917 918 return *last; 919 } 920 921 static void defer_request(struct i915_request *rq, struct list_head * const pl) 922 { 923 LIST_HEAD(list); 924 925 /* 926 * We want to move the interrupted request to the back of 927 * the round-robin list (i.e. its priority level), but 928 * in doing so, we must then move all requests that were in 929 * flight and were waiting for the interrupted request to 930 * be run after it again. 931 */ 932 do { 933 struct i915_dependency *p; 934 935 GEM_BUG_ON(i915_request_is_active(rq)); 936 list_move_tail(&rq->sched.link, pl); 937 938 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 939 struct i915_request *w = 940 container_of(p->waiter, typeof(*w), sched); 941 942 /* Leave semaphores spinning on the other engines */ 943 if (w->engine != rq->engine) 944 continue; 945 946 /* No waiter should start before its signaler */ 947 GEM_BUG_ON(i915_request_started(w) && 948 !i915_request_completed(rq)); 949 950 GEM_BUG_ON(i915_request_is_active(w)); 951 if (list_empty(&w->sched.link)) 952 continue; /* Not yet submitted; unready */ 953 954 if (rq_prio(w) < rq_prio(rq)) 955 continue; 956 957 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 958 list_move_tail(&w->sched.link, &list); 959 } 960 961 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 962 } while (rq); 963 } 964 965 static void defer_active(struct intel_engine_cs *engine) 966 { 967 struct i915_request *rq; 968 969 rq = __unwind_incomplete_requests(engine); 970 if (!rq) 971 return; 972 973 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 974 } 975 976 static bool 977 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 978 { 979 int hint; 980 981 if (!intel_engine_has_semaphores(engine)) 982 return false; 983 984 if (list_is_last(&rq->sched.link, &engine->active.requests)) 985 return false; 986 987 hint = max(rq_prio(list_next_entry(rq, sched.link)), 988 engine->execlists.queue_priority_hint); 989 990 return hint >= effective_prio(rq); 991 } 992 993 static int 994 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 995 { 996 if (list_is_last(&rq->sched.link, &engine->active.requests)) 997 return INT_MIN; 998 999 return rq_prio(list_next_entry(rq, sched.link)); 1000 } 1001 1002 static bool 1003 enable_timeslice(const struct intel_engine_execlists *execlists) 1004 { 1005 const struct i915_request *rq = *execlists->active; 1006 1007 if (i915_request_completed(rq)) 1008 return false; 1009 1010 return execlists->switch_priority_hint >= effective_prio(rq); 1011 } 1012 1013 static void record_preemption(struct intel_engine_execlists *execlists) 1014 
static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = __unwind_incomplete_requests(engine);
	if (!rq)
		return;

	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
}

static bool
need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	int hint;

	if (!intel_engine_has_semaphores(engine))
		return false;

	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return false;

	hint = max(rq_prio(list_next_entry(rq, sched.link)),
		   engine->execlists.queue_priority_hint);

	return hint >= effective_prio(rq);
}

static int
switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return INT_MIN;

	return rq_prio(list_next_entry(rq, sched.link));
}

static bool
enable_timeslice(const struct intel_engine_execlists *execlists)
{
	const struct i915_request *rq = *execlists->active;

	if (i915_request_completed(rq))
		return false;

	return execlists->switch_priority_hint >= effective_prio(rq);
}

static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request **port = execlists->pending;
	struct i915_request ** const last_port = port + execlists->port_mask;
	struct i915_request *last;
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (!rq) { /* lazily cleanup after another engine handled rq */
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		if (!virtual_matches(ve, rq, engine)) {
			rb = rb_next(rb);
			continue;
		}

		break;
	}

	/*
	 * If the queue is higher priority than the last
	 * request in the currently active context, submit afresh.
	 * We will resubmit again afterwards in case we need to split
	 * the active context to interject the preemption request,
	 * i.e. we will retrigger preemption following the ack in case
	 * of trouble.
	 */
	last = last_active(execlists);
	if (last) {
		if (need_preempt(engine, last, rb)) {
			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
				  engine->name,
				  last->fence.context,
				  last->fence.seqno,
				  last->sched.attr.priority,
				  execlists->queue_priority_hint);
			record_preemption(execlists);

			/*
			 * Don't let the RING_HEAD advance past the breadcrumb
			 * as we unwind (and until we resubmit) so that we do
			 * not accidentally tell it to go backwards.
			 */
			ring_set_paused(engine, 1);

			/*
			 * Note that we have not stopped the GPU at this point,
			 * so we are unwinding the incomplete requests as they
			 * remain inflight and so by the time we do complete
			 * the preemption, some of the unwound requests may
			 * complete!
			 */
			__unwind_incomplete_requests(engine);

			/*
			 * If we need to return to the preempted context, we
			 * need to skip the lite-restore and force it to
			 * reload the RING_TAIL. Otherwise, the HW has a
			 * tendency to ignore us rewinding the TAIL to the
			 * end of an earlier request.
1110 */ 1111 last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1112 last = NULL; 1113 } else if (need_timeslice(engine, last) && 1114 !timer_pending(&engine->execlists.timer)) { 1115 GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", 1116 engine->name, 1117 last->fence.context, 1118 last->fence.seqno, 1119 last->sched.attr.priority, 1120 execlists->queue_priority_hint); 1121 1122 ring_set_paused(engine, 1); 1123 defer_active(engine); 1124 1125 /* 1126 * Unlike for preemption, if we rewind and continue 1127 * executing the same context as previously active, 1128 * the order of execution will remain the same and 1129 * the tail will only advance. We do not need to 1130 * force a full context restore, as a lite-restore 1131 * is sufficient to resample the monotonic TAIL. 1132 * 1133 * If we switch to any other context, similarly we 1134 * will not rewind TAIL of current context, and 1135 * normal save/restore will preserve state and allow 1136 * us to later continue executing the same request. 1137 */ 1138 last = NULL; 1139 } else { 1140 /* 1141 * Otherwise if we already have a request pending 1142 * for execution after the current one, we can 1143 * just wait until the next CS event before 1144 * queuing more. In either case we will force a 1145 * lite-restore preemption event, but if we wait 1146 * we hopefully coalesce several updates into a single 1147 * submission. 1148 */ 1149 if (!list_is_last(&last->sched.link, 1150 &engine->active.requests)) 1151 return; 1152 1153 /* 1154 * WaIdleLiteRestore:bdw,skl 1155 * Apply the wa NOOPs to prevent 1156 * ring:HEAD == rq:TAIL as we resubmit the 1157 * request. See gen8_emit_fini_breadcrumb() for 1158 * where we prepare the padding after the 1159 * end of the request. 1160 */ 1161 last->tail = last->wa_tail; 1162 } 1163 } 1164 1165 while (rb) { /* XXX virtual is always taking precedence */ 1166 struct virtual_engine *ve = 1167 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1168 struct i915_request *rq; 1169 1170 spin_lock(&ve->base.active.lock); 1171 1172 rq = ve->request; 1173 if (unlikely(!rq)) { /* lost the race to a sibling */ 1174 spin_unlock(&ve->base.active.lock); 1175 rb_erase_cached(rb, &execlists->virtual); 1176 RB_CLEAR_NODE(rb); 1177 rb = rb_first_cached(&execlists->virtual); 1178 continue; 1179 } 1180 1181 GEM_BUG_ON(rq != ve->request); 1182 GEM_BUG_ON(rq->engine != &ve->base); 1183 GEM_BUG_ON(rq->hw_context != &ve->context); 1184 1185 if (rq_prio(rq) >= queue_prio(execlists)) { 1186 if (!virtual_matches(ve, rq, engine)) { 1187 spin_unlock(&ve->base.active.lock); 1188 rb = rb_next(rb); 1189 continue; 1190 } 1191 1192 if (last && !can_merge_rq(last, rq)) { 1193 spin_unlock(&ve->base.active.lock); 1194 return; /* leave this for another */ 1195 } 1196 1197 GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", 1198 engine->name, 1199 rq->fence.context, 1200 rq->fence.seqno, 1201 i915_request_completed(rq) ? "!" : 1202 i915_request_started(rq) ? 
"*" : 1203 "", 1204 yesno(engine != ve->siblings[0])); 1205 1206 ve->request = NULL; 1207 ve->base.execlists.queue_priority_hint = INT_MIN; 1208 rb_erase_cached(rb, &execlists->virtual); 1209 RB_CLEAR_NODE(rb); 1210 1211 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1212 rq->engine = engine; 1213 1214 if (engine != ve->siblings[0]) { 1215 u32 *regs = ve->context.lrc_reg_state; 1216 unsigned int n; 1217 1218 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1219 virtual_update_register_offsets(regs, engine); 1220 1221 if (!list_empty(&ve->context.signals)) 1222 virtual_xfer_breadcrumbs(ve, engine); 1223 1224 /* 1225 * Move the bound engine to the top of the list 1226 * for future execution. We then kick this 1227 * tasklet first before checking others, so that 1228 * we preferentially reuse this set of bound 1229 * registers. 1230 */ 1231 for (n = 1; n < ve->num_siblings; n++) { 1232 if (ve->siblings[n] == engine) { 1233 swap(ve->siblings[n], 1234 ve->siblings[0]); 1235 break; 1236 } 1237 } 1238 1239 GEM_BUG_ON(ve->siblings[0] != engine); 1240 } 1241 1242 if (__i915_request_submit(rq)) { 1243 submit = true; 1244 last = rq; 1245 } 1246 i915_request_put(rq); 1247 1248 /* 1249 * Hmm, we have a bunch of virtual engine requests, 1250 * but the first one was already completed (thanks 1251 * preempt-to-busy!). Keep looking at the veng queue 1252 * until we have no more relevant requests (i.e. 1253 * the normal submit queue has higher priority). 1254 */ 1255 if (!submit) { 1256 spin_unlock(&ve->base.active.lock); 1257 rb = rb_first_cached(&execlists->virtual); 1258 continue; 1259 } 1260 } 1261 1262 spin_unlock(&ve->base.active.lock); 1263 break; 1264 } 1265 1266 while ((rb = rb_first_cached(&execlists->queue))) { 1267 struct i915_priolist *p = to_priolist(rb); 1268 struct i915_request *rq, *rn; 1269 int i; 1270 1271 priolist_for_each_request_consume(rq, rn, p, i) { 1272 bool merge = true; 1273 1274 /* 1275 * Can we combine this request with the current port? 1276 * It has to be the same context/ringbuffer and not 1277 * have any exceptions (e.g. GVT saying never to 1278 * combine contexts). 1279 * 1280 * If we can combine the requests, we can execute both 1281 * by updating the RING_TAIL to point to the end of the 1282 * second request, and so we never need to tell the 1283 * hardware about the first. 1284 */ 1285 if (last && !can_merge_rq(last, rq)) { 1286 /* 1287 * If we are on the second port and cannot 1288 * combine this request with the last, then we 1289 * are done. 1290 */ 1291 if (port == last_port) 1292 goto done; 1293 1294 /* 1295 * We must not populate both ELSP[] with the 1296 * same LRCA, i.e. we must submit 2 different 1297 * contexts if we submit 2 ELSP. 1298 */ 1299 if (last->hw_context == rq->hw_context) 1300 goto done; 1301 1302 /* 1303 * If GVT overrides us we only ever submit 1304 * port[0], leaving port[1] empty. Note that we 1305 * also have to be careful that we don't queue 1306 * the same context (even though a different 1307 * request) to the second port. 
1308 */ 1309 if (ctx_single_port_submission(last->hw_context) || 1310 ctx_single_port_submission(rq->hw_context)) 1311 goto done; 1312 1313 merge = false; 1314 } 1315 1316 if (__i915_request_submit(rq)) { 1317 if (!merge) { 1318 *port = execlists_schedule_in(last, port - execlists->pending); 1319 port++; 1320 last = NULL; 1321 } 1322 1323 GEM_BUG_ON(last && 1324 !can_merge_ctx(last->hw_context, 1325 rq->hw_context)); 1326 1327 submit = true; 1328 last = rq; 1329 } 1330 } 1331 1332 rb_erase_cached(&p->node, &execlists->queue); 1333 i915_priolist_free(p); 1334 } 1335 1336 done: 1337 /* 1338 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 1339 * 1340 * We choose the priority hint such that if we add a request of greater 1341 * priority than this, we kick the submission tasklet to decide on 1342 * the right order of submitting the requests to hardware. We must 1343 * also be prepared to reorder requests as they are in-flight on the 1344 * HW. We derive the priority hint then as the first "hole" in 1345 * the HW submission ports and if there are no available slots, 1346 * the priority of the lowest executing request, i.e. last. 1347 * 1348 * When we do receive a higher priority request ready to run from the 1349 * user, see queue_request(), the priority hint is bumped to that 1350 * request triggering preemption on the next dequeue (or subsequent 1351 * interrupt for secondary ports). 1352 */ 1353 execlists->queue_priority_hint = queue_prio(execlists); 1354 GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n", 1355 engine->name, execlists->queue_priority_hint, 1356 yesno(submit)); 1357 1358 if (submit) { 1359 *port = execlists_schedule_in(last, port - execlists->pending); 1360 memset(port + 1, 0, (last_port - port) * sizeof(*port)); 1361 execlists->switch_priority_hint = 1362 switch_prio(engine, *execlists->pending); 1363 execlists_submit_ports(engine); 1364 } else { 1365 ring_set_paused(engine, 0); 1366 } 1367 } 1368 1369 static void 1370 cancel_port_requests(struct intel_engine_execlists * const execlists) 1371 { 1372 struct i915_request * const *port, *rq; 1373 1374 for (port = execlists->pending; (rq = *port); port++) 1375 execlists_schedule_out(rq); 1376 memset(execlists->pending, 0, sizeof(execlists->pending)); 1377 1378 for (port = execlists->active; (rq = *port); port++) 1379 execlists_schedule_out(rq); 1380 execlists->active = 1381 memset(execlists->inflight, 0, sizeof(execlists->inflight)); 1382 } 1383 1384 static inline void 1385 invalidate_csb_entries(const u32 *first, const u32 *last) 1386 { 1387 clflush((void *)first); 1388 clflush((void *)last); 1389 } 1390 1391 static inline bool 1392 reset_in_progress(const struct intel_engine_execlists *execlists) 1393 { 1394 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1395 } 1396 1397 enum csb_step { 1398 CSB_NOP, 1399 CSB_PROMOTE, 1400 CSB_PREEMPT, 1401 CSB_COMPLETE, 1402 }; 1403 1404 /* 1405 * Starting with Gen12, the status has a new format: 1406 * 1407 * bit 0: switched to new queue 1408 * bit 1: reserved 1409 * bit 2: semaphore wait mode (poll or signal), only valid when 1410 * switch detail is set to "wait on semaphore" 1411 * bits 3-5: engine class 1412 * bits 6-11: engine instance 1413 * bits 12-14: reserved 1414 * bits 15-25: sw context id of the lrc the GT switched to 1415 * bits 26-31: sw counter of the lrc the GT switched to 1416 * bits 32-35: context switch detail 1417 * - 0: ctx complete 1418 * - 1: wait on sync flip 1419 * - 2: wait on vblank 1420 * - 3: wait on scanline 1421 * - 4: wait on 
 *                  - 5: context preempted (not on SEMAPHORE_WAIT or
 *                       WAIT_FOR_EVENT)
 *     bit  36:    reserved
 *     bits 37-43: wait detail (for switch detail 1 to 4)
 *     bits 44-46: reserved
 *     bits 47-57: sw context id of the lrc the GT switched away from
 *     bits 58-63: sw counter of the lrc the GT switched away from
 */
static inline enum csb_step
gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
{
	u32 lower_dw = csb[0];
	u32 upper_dw = csb[1];
	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;

	if (!ctx_away_valid && ctx_to_valid)
		return CSB_PROMOTE;

	/*
	 * The context switch detail is not guaranteed to be 5 when a preemption
	 * occurs, so we can't just check for that. The check below works for
	 * all the cases we care about, including preemptions of WAIT
	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
	 * would require some extra handling, but we don't support that.
	 */
	if (new_queue && ctx_away_valid)
		return CSB_PREEMPT;

	/*
	 * switch detail = 5 is covered by the case above and we do not expect a
	 * context switch on an unsuccessful wait instruction since we always
	 * use polling mode.
	 */
	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));

	if (*execlists->active) {
		GEM_BUG_ON(!ctx_away_valid);
		return CSB_COMPLETE;
	}

	return CSB_NOP;
}

static inline enum csb_step
gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
{
	unsigned int status = *csb;

	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
		return CSB_PROMOTE;

	if (status & GEN8_CTX_STATUS_PREEMPTED)
		return CSB_PREEMPT;

	if (*execlists->active)
		return CSB_COMPLETE;

	return CSB_NOP;
}

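/*
 * A worked example of the gen12 decode above (values illustrative): a CSB
 * entry whose lower dword has bit 0 set and carries a valid "to" context
 * id, paired with an upper dword whose sw context id field reads 0x7FF
 * (idle), means nothing was switched away from and decodes as CSB_PROMOTE;
 * the same lower dword paired with a valid "away" context id decodes as
 * CSB_PREEMPT.
 */
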
1516 */ 1517 rmb(); 1518 1519 do { 1520 enum csb_step csb_step; 1521 1522 if (++head == num_entries) 1523 head = 0; 1524 1525 /* 1526 * We are flying near dragons again. 1527 * 1528 * We hold a reference to the request in execlist_port[] 1529 * but no more than that. We are operating in softirq 1530 * context and so cannot hold any mutex or sleep. That 1531 * prevents us stopping the requests we are processing 1532 * in port[] from being retired simultaneously (the 1533 * breadcrumb will be complete before we see the 1534 * context-switch). As we only hold the reference to the 1535 * request, any pointer chasing underneath the request 1536 * is subject to a potential use-after-free. Thus we 1537 * store all of the bookkeeping within port[] as 1538 * required, and avoid using unguarded pointers beneath 1539 * request itself. The same applies to the atomic 1540 * status notifier. 1541 */ 1542 1543 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n", 1544 engine->name, head, 1545 buf[2 * head + 0], buf[2 * head + 1]); 1546 1547 if (INTEL_GEN(engine->i915) >= 12) 1548 csb_step = gen12_csb_parse(execlists, buf + 2 * head); 1549 else 1550 csb_step = gen8_csb_parse(execlists, buf + 2 * head); 1551 1552 switch (csb_step) { 1553 case CSB_PREEMPT: /* cancel old inflight, prepare for switch */ 1554 trace_ports(execlists, "preempted", execlists->active); 1555 1556 while (*execlists->active) 1557 execlists_schedule_out(*execlists->active++); 1558 1559 /* fallthrough */ 1560 case CSB_PROMOTE: /* switch pending to inflight */ 1561 GEM_BUG_ON(*execlists->active); 1562 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 1563 execlists->active = 1564 memcpy(execlists->inflight, 1565 execlists->pending, 1566 execlists_num_ports(execlists) * 1567 sizeof(*execlists->pending)); 1568 1569 if (enable_timeslice(execlists)) 1570 mod_timer(&execlists->timer, jiffies + 1); 1571 1572 if (!inject_preempt_hang(execlists)) 1573 ring_set_paused(engine, 0); 1574 1575 WRITE_ONCE(execlists->pending[0], NULL); 1576 break; 1577 1578 case CSB_COMPLETE: /* port0 completed, advanced to port1 */ 1579 trace_ports(execlists, "completed", execlists->active); 1580 1581 /* 1582 * We rely on the hardware being strongly 1583 * ordered, that the breadcrumb write is 1584 * coherent (visible from the CPU) before the 1585 * user interrupt and CSB is processed. 1586 */ 1587 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 1588 !reset_in_progress(execlists)); 1589 execlists_schedule_out(*execlists->active++); 1590 1591 GEM_BUG_ON(execlists->active - execlists->inflight > 1592 execlists_num_ports(execlists)); 1593 break; 1594 1595 case CSB_NOP: 1596 break; 1597 } 1598 } while (head != tail); 1599 1600 execlists->csb_head = head; 1601 1602 /* 1603 * Gen11 has proven to fail wrt global observation point between 1604 * entry and tail update, failing on the ordering and thus 1605 * we see an old entry in the context status buffer. 1606 * 1607 * Forcibly evict out entries for the next gpu csb update, 1608 * to increase the odds that we get a fresh entries with non 1609 * working hardware. The cost for doing so comes out mostly with 1610 * the wash as hardware, working or not, will need to do the 1611 * invalidation before. 
1612 */ 1613 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 1614 } 1615 1616 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 1617 { 1618 lockdep_assert_held(&engine->active.lock); 1619 if (!engine->execlists.pending[0]) { 1620 rcu_read_lock(); /* protect peeking at execlists->active */ 1621 execlists_dequeue(engine); 1622 rcu_read_unlock(); 1623 } 1624 } 1625 1626 /* 1627 * Check the unread Context Status Buffers and manage the submission of new 1628 * contexts to the ELSP accordingly. 1629 */ 1630 static void execlists_submission_tasklet(unsigned long data) 1631 { 1632 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 1633 unsigned long flags; 1634 1635 process_csb(engine); 1636 if (!READ_ONCE(engine->execlists.pending[0])) { 1637 spin_lock_irqsave(&engine->active.lock, flags); 1638 __execlists_submission_tasklet(engine); 1639 spin_unlock_irqrestore(&engine->active.lock, flags); 1640 } 1641 } 1642 1643 static void execlists_submission_timer(struct timer_list *timer) 1644 { 1645 struct intel_engine_cs *engine = 1646 from_timer(engine, timer, execlists.timer); 1647 1648 /* Kick the tasklet for some interrupt coalescing and reset handling */ 1649 tasklet_hi_schedule(&engine->execlists.tasklet); 1650 } 1651 1652 static void queue_request(struct intel_engine_cs *engine, 1653 struct i915_sched_node *node, 1654 int prio) 1655 { 1656 GEM_BUG_ON(!list_empty(&node->link)); 1657 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); 1658 } 1659 1660 static void __submit_queue_imm(struct intel_engine_cs *engine) 1661 { 1662 struct intel_engine_execlists * const execlists = &engine->execlists; 1663 1664 if (reset_in_progress(execlists)) 1665 return; /* defer until we restart the engine following reset */ 1666 1667 if (execlists->tasklet.func == execlists_submission_tasklet) 1668 __execlists_submission_tasklet(engine); 1669 else 1670 tasklet_hi_schedule(&execlists->tasklet); 1671 } 1672 1673 static void submit_queue(struct intel_engine_cs *engine, 1674 const struct i915_request *rq) 1675 { 1676 struct intel_engine_execlists *execlists = &engine->execlists; 1677 1678 if (rq_prio(rq) <= execlists->queue_priority_hint) 1679 return; 1680 1681 execlists->queue_priority_hint = rq_prio(rq); 1682 __submit_queue_imm(engine); 1683 } 1684 1685 static void execlists_submit_request(struct i915_request *request) 1686 { 1687 struct intel_engine_cs *engine = request->engine; 1688 unsigned long flags; 1689 1690 /* Will be called from irq-context when using foreign fences. 
*/ 1691 spin_lock_irqsave(&engine->active.lock, flags); 1692 1693 queue_request(engine, &request->sched, rq_prio(request)); 1694 1695 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1696 GEM_BUG_ON(list_empty(&request->sched.link)); 1697 1698 submit_queue(engine, request); 1699 1700 spin_unlock_irqrestore(&engine->active.lock, flags); 1701 } 1702 1703 static void __execlists_context_fini(struct intel_context *ce) 1704 { 1705 intel_ring_put(ce->ring); 1706 i915_vma_put(ce->state); 1707 } 1708 1709 static void execlists_context_destroy(struct kref *kref) 1710 { 1711 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 1712 1713 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 1714 GEM_BUG_ON(intel_context_is_pinned(ce)); 1715 1716 if (ce->state) 1717 __execlists_context_fini(ce); 1718 1719 intel_context_fini(ce); 1720 intel_context_free(ce); 1721 } 1722 1723 static void 1724 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 1725 { 1726 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1727 return; 1728 1729 vaddr += LRC_HEADER_PAGES * PAGE_SIZE; 1730 vaddr += engine->context_size; 1731 1732 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); 1733 } 1734 1735 static void 1736 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 1737 { 1738 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1739 return; 1740 1741 vaddr += LRC_HEADER_PAGES * PAGE_SIZE; 1742 vaddr += engine->context_size; 1743 1744 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) 1745 dev_err_once(engine->i915->drm.dev, 1746 "%s context redzone overwritten!\n", 1747 engine->name); 1748 } 1749 1750 static void execlists_context_unpin(struct intel_context *ce) 1751 { 1752 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 1753 ce->engine); 1754 1755 i915_gem_context_unpin_hw_id(ce->gem_context); 1756 i915_gem_object_unpin_map(ce->state->obj); 1757 intel_ring_reset(ce->ring, ce->ring->tail); 1758 } 1759 1760 static void 1761 __execlists_update_reg_state(struct intel_context *ce, 1762 struct intel_engine_cs *engine) 1763 { 1764 struct intel_ring *ring = ce->ring; 1765 u32 *regs = ce->lrc_reg_state; 1766 1767 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); 1768 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 1769 1770 regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma); 1771 regs[CTX_RING_HEAD + 1] = ring->head; 1772 regs[CTX_RING_TAIL + 1] = ring->tail; 1773 1774 /* RPCS */ 1775 if (engine->class == RENDER_CLASS) { 1776 regs[CTX_R_PWR_CLK_STATE + 1] = 1777 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 1778 1779 i915_oa_init_reg_state(engine, ce, regs); 1780 } 1781 } 1782 1783 static int 1784 __execlists_context_pin(struct intel_context *ce, 1785 struct intel_engine_cs *engine) 1786 { 1787 void *vaddr; 1788 int ret; 1789 1790 GEM_BUG_ON(!ce->state); 1791 1792 ret = intel_context_active_acquire(ce); 1793 if (ret) 1794 goto err; 1795 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 1796 1797 vaddr = i915_gem_object_pin_map(ce->state->obj, 1798 i915_coherent_map_type(engine->i915) | 1799 I915_MAP_OVERRIDE); 1800 if (IS_ERR(vaddr)) { 1801 ret = PTR_ERR(vaddr); 1802 goto unpin_active; 1803 } 1804 1805 ret = i915_gem_context_pin_hw_id(ce->gem_context); 1806 if (ret) 1807 goto unpin_map; 1808 1809 ce->lrc_desc = lrc_descriptor(ce, engine); 1810 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 1811 __execlists_update_reg_state(ce, engine); 1812 1813 return 0; 1814 1815 unpin_map: 1816 i915_gem_object_unpin_map(ce->state->obj); 1817 unpin_active: 1818 
intel_context_active_release(ce); 1819 err: 1820 return ret; 1821 } 1822 1823 static int execlists_context_pin(struct intel_context *ce) 1824 { 1825 return __execlists_context_pin(ce, ce->engine); 1826 } 1827 1828 static int execlists_context_alloc(struct intel_context *ce) 1829 { 1830 return __execlists_context_alloc(ce, ce->engine); 1831 } 1832 1833 static void execlists_context_reset(struct intel_context *ce) 1834 { 1835 /* 1836 * Because we emit WA_TAIL_DWORDS there may be a disparity 1837 * between our bookkeeping in ce->ring->head and ce->ring->tail and 1838 * that stored in the context. As we only write new commands from 1839 * ce->ring->tail onwards, everything before that is junk. If the GPU 1840 * starts reading its RING_HEAD from the context, it may try to 1841 * execute that junk and die. 1842 * 1843 * The contexts that are still pinned on resume belong to the 1844 * kernel, and are local to each engine. All other contexts will 1845 * have their head/tail sanitized upon pinning before use, so they 1846 * will never see garbage. 1847 * 1848 * So to avoid that we reset the context images upon resume. For 1849 * simplicity, we just zero everything out. 1850 */ 1851 intel_ring_reset(ce->ring, 0); 1852 __execlists_update_reg_state(ce, ce->engine); 1853 } 1854 1855 static const struct intel_context_ops execlists_context_ops = { 1856 .alloc = execlists_context_alloc, 1857 1858 .pin = execlists_context_pin, 1859 .unpin = execlists_context_unpin, 1860 1861 .enter = intel_context_enter_engine, 1862 .exit = intel_context_exit_engine, 1863 1864 .reset = execlists_context_reset, 1865 .destroy = execlists_context_destroy, 1866 }; 1867 1868 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 1869 { 1870 u32 *cs; 1871 1872 GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb); 1873 1874 cs = intel_ring_begin(rq, 6); 1875 if (IS_ERR(cs)) 1876 return PTR_ERR(cs); 1877 1878 /* 1879 * Check if we have been preempted before we even get started. 1880 * 1881 * After this point i915_request_started() reports true, even if 1882 * we get preempted and so are no longer running. 1883 */ 1884 *cs++ = MI_ARB_CHECK; 1885 *cs++ = MI_NOOP; 1886 1887 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1888 *cs++ = rq->timeline->hwsp_offset; 1889 *cs++ = 0; 1890 *cs++ = rq->fence.seqno - 1; 1891 1892 intel_ring_advance(rq, cs); 1893 1894 /* Record the updated position of the request's payload */ 1895 rq->infix = intel_ring_offset(rq, cs); 1896 1897 return 0; 1898 } 1899 1900 static int emit_pdps(struct i915_request *rq) 1901 { 1902 const struct intel_engine_cs * const engine = rq->engine; 1903 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->hw_context->vm); 1904 int err, i; 1905 u32 *cs; 1906 1907 GEM_BUG_ON(intel_vgpu_active(rq->i915)); 1908 1909 /* 1910 * Beware ye of the dragons, this sequence is magic! 1911 * 1912 * Small changes to this sequence can cause anything from 1913 * GPU hangs to forcewake errors and machine lockups! 1914 */ 1915 1916 /* Flush any residual operations from the context load */ 1917 err = engine->emit_flush(rq, EMIT_FLUSH); 1918 if (err) 1919 return err; 1920 1921 /* Magic required to prevent forcewake errors!
*/ 1922 err = engine->emit_flush(rq, EMIT_INVALIDATE); 1923 if (err) 1924 return err; 1925 1926 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 1927 if (IS_ERR(cs)) 1928 return PTR_ERR(cs); 1929 1930 /* Ensure the LRI have landed before we invalidate & continue */ 1931 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 1932 for (i = GEN8_3LVL_PDPES; i--; ) { 1933 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 1934 u32 base = engine->mmio_base; 1935 1936 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 1937 *cs++ = upper_32_bits(pd_daddr); 1938 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 1939 *cs++ = lower_32_bits(pd_daddr); 1940 } 1941 *cs++ = MI_NOOP; 1942 1943 intel_ring_advance(rq, cs); 1944 1945 /* Be doubly sure the LRI have landed before proceeding */ 1946 err = engine->emit_flush(rq, EMIT_FLUSH); 1947 if (err) 1948 return err; 1949 1950 /* Re-invalidate the TLB for luck */ 1951 return engine->emit_flush(rq, EMIT_INVALIDATE); 1952 } 1953 1954 static int execlists_request_alloc(struct i915_request *request) 1955 { 1956 int ret; 1957 1958 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); 1959 1960 /* 1961 * Flush enough space to reduce the likelihood of waiting after 1962 * we start building the request - in which case we will just 1963 * have to repeat work. 1964 */ 1965 request->reserved_space += EXECLISTS_REQUEST_SIZE; 1966 1967 /* 1968 * Note that after this point, we have committed to using 1969 * this request as it is being used to both track the 1970 * state of engine initialisation and liveness of the 1971 * golden renderstate above. Think twice before you try 1972 * to cancel/unwind this request now. 1973 */ 1974 1975 /* Unconditionally invalidate GPU caches and TLBs. */ 1976 if (i915_vm_is_4lvl(request->hw_context->vm)) 1977 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 1978 else 1979 ret = emit_pdps(request); 1980 if (ret) 1981 return ret; 1982 1983 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 1984 return 0; 1985 } 1986 1987 /* 1988 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 1989 * PIPE_CONTROL instruction. This is required for the flush to happen correctly, 1990 * but there is a slight complication as this is applied in a WA batch where the 1991 * values are only initialized once, so we cannot read the register value at the 1992 * beginning and reuse it later; hence we save its value to memory, upload a 1993 * constant value with bit21 set and then restore it with the saved value. 1994 * To simplify the WA, a constant value is formed by using the default value 1995 * of this register. This shouldn't be a problem because we are only modifying 1996 * it for a short period and this batch is non-preemptible. We could of course 1997 * use additional instructions that read the actual value of the register 1998 * at that time and set our bit of interest, but that makes the WA more complicated. 1999 * 2000 * This WA is also required for Gen9, so extracting it as a function avoids 2001 * code duplication. 2002 */ 2003 static u32 * 2004 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 2005 { 2006 /* NB no one else is allowed to scribble over scratch + 256!
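 *
 * In outline, the sequence emitted below is:
 *
 *	SRM  GEN8_L3SQCREG4 -> scratch		(save current value)
 *	LRI  GEN8_L3SQCREG4 = default | GEN8_LQSC_FLUSH_COHERENT_LINES
 *	PIPE_CONTROL (CS stall + DC flush)
 *	LRM  GEN8_L3SQCREG4 <- scratch		(restore saved value)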
*/ 2007 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2008 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2009 *batch++ = intel_gt_scratch_offset(engine->gt, 2010 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2011 *batch++ = 0; 2012 2013 *batch++ = MI_LOAD_REGISTER_IMM(1); 2014 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2015 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 2016 2017 batch = gen8_emit_pipe_control(batch, 2018 PIPE_CONTROL_CS_STALL | 2019 PIPE_CONTROL_DC_FLUSH_ENABLE, 2020 0); 2021 2022 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2023 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2024 *batch++ = intel_gt_scratch_offset(engine->gt, 2025 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2026 *batch++ = 0; 2027 2028 return batch; 2029 } 2030 2031 static u32 slm_offset(struct intel_engine_cs *engine) 2032 { 2033 return intel_gt_scratch_offset(engine->gt, 2034 INTEL_GT_SCRATCH_FIELD_CLEAR_SLM_WA); 2035 } 2036 2037 /* 2038 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 2039 * initialized at the beginning and shared across all contexts, but this field 2040 * helps us to have multiple batches at different offsets and select them based 2041 * on some criteria. At the moment this batch always starts at the beginning of the page 2042 * and at this point we don't have multiple wa_ctx batch buffers. 2043 * 2044 * The number of WAs applied is not known at the beginning; we use this field 2045 * to return the number of DWORDs written. 2046 * 2047 * Note that this batch does not contain MI_BATCH_BUFFER_END, 2048 * so NOOPs are added as padding to make it cacheline aligned. 2049 * MI_BATCH_BUFFER_END will be added to the per_ctx batch and both of them together 2050 * make a complete batch buffer.
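 *
 * A sketch of the bookkeeping, mirroring the loop in
 * intel_init_workaround_bb() later in this file:
 *
 *	wa_bb[i]->offset = batch_ptr - batch;
 *	batch_ptr = wa_bb_fn[i](engine, batch_ptr);
 *	wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);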
2051 */ 2052 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2053 { 2054 /* WaDisableCtxRestoreArbitration:bdw,chv */ 2055 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2056 2057 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 2058 if (IS_BROADWELL(engine->i915)) 2059 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2060 2061 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 2062 /* Actual scratch location is at 128 bytes offset */ 2063 batch = gen8_emit_pipe_control(batch, 2064 PIPE_CONTROL_FLUSH_L3 | 2065 PIPE_CONTROL_GLOBAL_GTT_IVB | 2066 PIPE_CONTROL_CS_STALL | 2067 PIPE_CONTROL_QW_WRITE, 2068 slm_offset(engine)); 2069 2070 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2071 2072 /* Pad to end of cacheline */ 2073 while ((unsigned long)batch % CACHELINE_BYTES) 2074 *batch++ = MI_NOOP; 2075 2076 /* 2077 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 2078 * execution depends on the length specified in terms of cache lines 2079 * in the register CTX_RCS_INDIRECT_CTX 2080 */ 2081 2082 return batch; 2083 } 2084 2085 struct lri { 2086 i915_reg_t reg; 2087 u32 value; 2088 }; 2089 2090 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 2091 { 2092 GEM_BUG_ON(!count || count > 63); 2093 2094 *batch++ = MI_LOAD_REGISTER_IMM(count); 2095 do { 2096 *batch++ = i915_mmio_reg_offset(lri->reg); 2097 *batch++ = lri->value; 2098 } while (lri++, --count); 2099 *batch++ = MI_NOOP; 2100 2101 return batch; 2102 } 2103 2104 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2105 { 2106 static const struct lri lri[] = { 2107 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 2108 { 2109 COMMON_SLICE_CHICKEN2, 2110 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 2111 0), 2112 }, 2113 2114 /* BSpec: 11391 */ 2115 { 2116 FF_SLICE_CHICKEN, 2117 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 2118 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 2119 }, 2120 2121 /* BSpec: 11299 */ 2122 { 2123 _3D_CHICKEN3, 2124 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 2125 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 2126 } 2127 }; 2128 2129 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2130 2131 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 2132 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2133 2134 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 2135 2136 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 2137 if (HAS_POOLED_EU(engine->i915)) { 2138 /* 2139 * EU pool configuration is setup along with golden context 2140 * during context initialization. This value depends on 2141 * device type (2x6 or 3x6) and needs to be updated based 2142 * on which subslice is disabled especially for 2x6 2143 * devices, however it is safe to load default 2144 * configuration of 3x6 device instead of masking off 2145 * corresponding bits because HW ignores bits of a disabled 2146 * subslice and drops down to appropriate config. Please 2147 * see render_state_setup() in i915_gem_render_state.c for 2148 * possible configurations, to avoid duplication they are 2149 * not shown here again. 
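 * (The 0x00777000 dword in the MEDIA_POOL_STATE packet emitted below
 * is that default 3x6 pool configuration.)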
2150 */ 2151 *batch++ = GEN9_MEDIA_POOL_STATE; 2152 *batch++ = GEN9_MEDIA_POOL_ENABLE; 2153 *batch++ = 0x00777000; 2154 *batch++ = 0; 2155 *batch++ = 0; 2156 *batch++ = 0; 2157 } 2158 2159 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2160 2161 /* Pad to end of cacheline */ 2162 while ((unsigned long)batch % CACHELINE_BYTES) 2163 *batch++ = MI_NOOP; 2164 2165 return batch; 2166 } 2167 2168 static u32 * 2169 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2170 { 2171 int i; 2172 2173 /* 2174 * WaPipeControlBefore3DStateSamplePattern: cnl 2175 * 2176 * Ensure the engine is idle prior to programming a 2177 * 3DSTATE_SAMPLE_PATTERN during a context restore. 2178 */ 2179 batch = gen8_emit_pipe_control(batch, 2180 PIPE_CONTROL_CS_STALL, 2181 0); 2182 /* 2183 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 2184 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 2185 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 2186 * confusing. Since gen8_emit_pipe_control() already advances the 2187 * batch by 6 dwords, we advance the other 10 here, completing a 2188 * cacheline. It's not clear if the workaround requires this padding 2189 * before other commands, or if it's just the regular padding we would 2190 * already have for the workaround bb, so leave it here for now. 2191 */ 2192 for (i = 0; i < 10; i++) 2193 *batch++ = MI_NOOP; 2194 2195 /* Pad to end of cacheline */ 2196 while ((unsigned long)batch % CACHELINE_BYTES) 2197 *batch++ = MI_NOOP; 2198 2199 return batch; 2200 } 2201 2202 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 2203 2204 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 2205 { 2206 struct drm_i915_gem_object *obj; 2207 struct i915_vma *vma; 2208 int err; 2209 2210 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 2211 if (IS_ERR(obj)) 2212 return PTR_ERR(obj); 2213 2214 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 2215 if (IS_ERR(vma)) { 2216 err = PTR_ERR(vma); 2217 goto err; 2218 } 2219 2220 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 2221 if (err) 2222 goto err; 2223 2224 engine->wa_ctx.vma = vma; 2225 return 0; 2226 2227 err: 2228 i915_gem_object_put(obj); 2229 return err; 2230 } 2231 2232 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 2233 { 2234 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 2235 } 2236 2237 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 2238 2239 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 2240 { 2241 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 2242 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 2243 &wa_ctx->per_ctx }; 2244 wa_bb_func_t wa_bb_fn[2]; 2245 struct page *page; 2246 void *batch, *batch_ptr; 2247 unsigned int i; 2248 int ret; 2249 2250 if (engine->class != RENDER_CLASS) 2251 return 0; 2252 2253 switch (INTEL_GEN(engine->i915)) { 2254 case 12: 2255 case 11: 2256 return 0; 2257 case 10: 2258 wa_bb_fn[0] = gen10_init_indirectctx_bb; 2259 wa_bb_fn[1] = NULL; 2260 break; 2261 case 9: 2262 wa_bb_fn[0] = gen9_init_indirectctx_bb; 2263 wa_bb_fn[1] = NULL; 2264 break; 2265 case 8: 2266 wa_bb_fn[0] = gen8_init_indirectctx_bb; 2267 wa_bb_fn[1] = NULL; 2268 break; 2269 default: 2270 MISSING_CASE(INTEL_GEN(engine->i915)); 2271 return 0; 2272 } 2273 2274 ret = lrc_setup_wa_ctx(engine); 2275 if (ret) { 2276 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 2277 return ret; 2278 } 2279 2280 page = 
i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 2281 batch = batch_ptr = kmap_atomic(page); 2282 2283 /* 2284 * Emit the two workaround batch buffers, recording the offset from the 2285 * start of the workaround batch buffer object for each and their 2286 * respective sizes. 2287 */ 2288 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 2289 wa_bb[i]->offset = batch_ptr - batch; 2290 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 2291 CACHELINE_BYTES))) { 2292 ret = -EINVAL; 2293 break; 2294 } 2295 if (wa_bb_fn[i]) 2296 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 2297 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 2298 } 2299 2300 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 2301 2302 kunmap_atomic(batch); 2303 if (ret) 2304 lrc_destroy_wa_ctx(engine); 2305 2306 return ret; 2307 } 2308 2309 static void enable_execlists(struct intel_engine_cs *engine) 2310 { 2311 u32 mode; 2312 2313 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 2314 2315 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 2316 2317 if (INTEL_GEN(engine->i915) >= 11) 2318 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 2319 else 2320 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 2321 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 2322 2323 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 2324 2325 ENGINE_WRITE_FW(engine, 2326 RING_HWS_PGA, 2327 i915_ggtt_offset(engine->status_page.vma)); 2328 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 2329 } 2330 2331 static bool unexpected_starting_state(struct intel_engine_cs *engine) 2332 { 2333 bool unexpected = false; 2334 2335 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 2336 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 2337 unexpected = true; 2338 } 2339 2340 return unexpected; 2341 } 2342 2343 static int execlists_resume(struct intel_engine_cs *engine) 2344 { 2345 intel_engine_apply_workarounds(engine); 2346 intel_engine_apply_whitelist(engine); 2347 2348 intel_mocs_init_engine(engine); 2349 2350 intel_engine_reset_breadcrumbs(engine); 2351 2352 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 2353 struct drm_printer p = drm_debug_printer(__func__); 2354 2355 intel_engine_dump(engine, &p, NULL); 2356 } 2357 2358 enable_execlists(engine); 2359 2360 return 0; 2361 } 2362 2363 static void execlists_reset_prepare(struct intel_engine_cs *engine) 2364 { 2365 struct intel_engine_execlists * const execlists = &engine->execlists; 2366 unsigned long flags; 2367 2368 GEM_TRACE("%s: depth<-%d\n", engine->name, 2369 atomic_read(&execlists->tasklet.count)); 2370 2371 /* 2372 * Prevent request submission to the hardware until we have 2373 * completed the reset in i915_gem_reset_finish(). If a request 2374 * is completed by one engine, it may then queue a request 2375 * to a second via its execlists->tasklet *just* as we are 2376 * calling engine->resume() and also writing the ELSP. 2377 * Turning off the execlists->tasklet until the reset is over 2378 * prevents the race. 2379 */ 2380 __tasklet_disable_sync_once(&execlists->tasklet); 2381 GEM_BUG_ON(!reset_in_progress(execlists)); 2382 2383 /* And flush any current direct submission. */ 2384 spin_lock_irqsave(&engine->active.lock, flags); 2385 spin_unlock_irqrestore(&engine->active.lock, flags); 2386 2387 /* 2388 * We stop engines, otherwise we might get failed reset and a 2389 * dead gpu (on elk). 
Also as modern gpu as kbl can suffer 2390 * from system hang if batchbuffer is progressing when 2391 * the reset is issued, regardless of READY_TO_RESET ack. 2392 * Thus assume it is best to stop engines on all gens 2393 * where we have a gpu reset. 2394 * 2395 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 2396 * 2397 * FIXME: Wa for more modern gens needs to be validated 2398 */ 2399 intel_engine_stop_cs(engine); 2400 } 2401 2402 static void reset_csb_pointers(struct intel_engine_cs *engine) 2403 { 2404 struct intel_engine_execlists * const execlists = &engine->execlists; 2405 const unsigned int reset_value = execlists->csb_size - 1; 2406 2407 ring_set_paused(engine, 0); 2408 2409 /* 2410 * After a reset, the HW starts writing into CSB entry [0]. We 2411 * therefore have to set our HEAD pointer back one entry so that 2412 * the *first* entry we check is entry 0. To complicate this further, 2413 * as we don't wait for the first interrupt after reset, we have to 2414 * fake the HW write to point back to the last entry so that our 2415 * inline comparison of our cached head position against the last HW 2416 * write works even before the first interrupt. 2417 */ 2418 execlists->csb_head = reset_value; 2419 WRITE_ONCE(*execlists->csb_write, reset_value); 2420 wmb(); /* Make sure this is visible to HW (paranoia?) */ 2421 2422 invalidate_csb_entries(&execlists->csb_status[0], 2423 &execlists->csb_status[reset_value]); 2424 } 2425 2426 static struct i915_request *active_request(struct i915_request *rq) 2427 { 2428 const struct intel_context * const ce = rq->hw_context; 2429 struct i915_request *active = NULL; 2430 struct list_head *list; 2431 2432 if (!i915_request_is_active(rq)) /* unwound, but incomplete! */ 2433 return rq; 2434 2435 list = &rq->timeline->requests; 2436 list_for_each_entry_from_reverse(rq, list, link) { 2437 if (i915_request_completed(rq)) 2438 break; 2439 2440 if (rq->hw_context != ce) 2441 break; 2442 2443 active = rq; 2444 } 2445 2446 return active; 2447 } 2448 2449 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 2450 { 2451 struct intel_engine_execlists * const execlists = &engine->execlists; 2452 struct intel_context *ce; 2453 struct i915_request *rq; 2454 u32 *regs; 2455 2456 process_csb(engine); /* drain preemption events */ 2457 2458 /* Following the reset, we need to reload the CSB read/write pointers */ 2459 reset_csb_pointers(engine); 2460 2461 /* 2462 * Save the currently executing context, even if we completed 2463 * its request, it was still running at the time of the 2464 * reset and will have been clobbered. 2465 */ 2466 rq = execlists_active(execlists); 2467 if (!rq) 2468 goto unwind; 2469 2470 ce = rq->hw_context; 2471 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 2472 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2473 rq = active_request(rq); 2474 if (!rq) { 2475 ce->ring->head = ce->ring->tail; 2476 goto out_replay; 2477 } 2478 2479 ce->ring->head = intel_ring_wrap(ce->ring, rq->head); 2480 2481 /* 2482 * If this request hasn't started yet, e.g. it is waiting on a 2483 * semaphore, we need to avoid skipping the request or else we 2484 * break the signaling chain. However, if the context is corrupt 2485 * the request will not restart and we will be stuck with a wedged 2486 * device. It is quite often the case that if we issue a reset 2487 * while the GPU is loading the context image, that the context 2488 * image becomes corrupt. 
2489 * 2490 * Otherwise, if we have not started yet, the request should replay 2491 * perfectly and we do not need to flag the result as being erroneous. 2492 */ 2493 if (!i915_request_started(rq)) 2494 goto out_replay; 2495 2496 /* 2497 * If the request was innocent, we leave the request in the ELSP 2498 * and will try to replay it on restarting. The context image may 2499 * have been corrupted by the reset, in which case we may have 2500 * to service a new GPU hang, but more likely we can continue on 2501 * without impact. 2502 * 2503 * If the request was guilty, we presume the context is corrupt 2504 * and have to at least restore the RING register in the context 2505 * image back to the expected values to skip over the guilty request. 2506 */ 2507 __i915_request_reset(rq, stalled); 2508 if (!stalled) 2509 goto out_replay; 2510 2511 /* 2512 * We want a simple context + ring to execute the breadcrumb update. 2513 * We cannot rely on the context being intact across the GPU hang, 2514 * so clear it and rebuild just what we need for the breadcrumb. 2515 * All pending requests for this context will be zapped, and any 2516 * future request will be after userspace has had the opportunity 2517 * to recreate its own state. 2518 */ 2519 regs = ce->lrc_reg_state; 2520 if (engine->pinned_default_state) { 2521 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 2522 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 2523 engine->context_size - PAGE_SIZE); 2524 } 2525 execlists_init_reg_state(regs, ce, engine, ce->ring); 2526 2527 out_replay: 2528 GEM_TRACE("%s replay {head:%04x, tail:%04x\n", 2529 engine->name, ce->ring->head, ce->ring->tail); 2530 intel_ring_update_space(ce->ring); 2531 __execlists_update_reg_state(ce, engine); 2532 2533 unwind: 2534 /* Push back any incomplete requests for replay after the reset. */ 2535 cancel_port_requests(execlists); 2536 __unwind_incomplete_requests(engine); 2537 } 2538 2539 static void execlists_reset(struct intel_engine_cs *engine, bool stalled) 2540 { 2541 unsigned long flags; 2542 2543 GEM_TRACE("%s\n", engine->name); 2544 2545 spin_lock_irqsave(&engine->active.lock, flags); 2546 2547 __execlists_reset(engine, stalled); 2548 2549 spin_unlock_irqrestore(&engine->active.lock, flags); 2550 } 2551 2552 static void nop_submission_tasklet(unsigned long data) 2553 { 2554 /* The driver is wedged; don't process any more events. */ 2555 } 2556 2557 static void execlists_cancel_requests(struct intel_engine_cs *engine) 2558 { 2559 struct intel_engine_execlists * const execlists = &engine->execlists; 2560 struct i915_request *rq, *rn; 2561 struct rb_node *rb; 2562 unsigned long flags; 2563 2564 GEM_TRACE("%s\n", engine->name); 2565 2566 /* 2567 * Before we call engine->cancel_requests(), we should have exclusive 2568 * access to the submission state. This is arranged for us by the 2569 * caller disabling the interrupt generation, the tasklet and other 2570 * threads that may then access the same state, giving us a free hand 2571 * to reset state. However, we still need to let lockdep be aware that 2572 * we know this state may be accessed in hardirq context, so we 2573 * disable the irq around this manipulation and we want to keep 2574 * the spinlock focused on its duties and not accidentally conflate 2575 * coverage to the submission's irq state. (Similarly, although we 2576 * shouldn't need to disable irq around the manipulation of the 2577 * submission's irq state, we also wish to remind ourselves that 2578 * it is irq state.) 
2579 */ 2580 spin_lock_irqsave(&engine->active.lock, flags); 2581 2582 __execlists_reset(engine, true); 2583 2584 /* Mark all executing requests as skipped. */ 2585 list_for_each_entry(rq, &engine->active.requests, sched.link) 2586 mark_eio(rq); 2587 2588 /* Flush the queued requests to the timeline list (for retiring). */ 2589 while ((rb = rb_first_cached(&execlists->queue))) { 2590 struct i915_priolist *p = to_priolist(rb); 2591 int i; 2592 2593 priolist_for_each_request_consume(rq, rn, p, i) { 2594 mark_eio(rq); 2595 __i915_request_submit(rq); 2596 } 2597 2598 rb_erase_cached(&p->node, &execlists->queue); 2599 i915_priolist_free(p); 2600 } 2601 2602 /* Cancel all attached virtual engines */ 2603 while ((rb = rb_first_cached(&execlists->virtual))) { 2604 struct virtual_engine *ve = 2605 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2606 2607 rb_erase_cached(rb, &execlists->virtual); 2608 RB_CLEAR_NODE(rb); 2609 2610 spin_lock(&ve->base.active.lock); 2611 rq = fetch_and_zero(&ve->request); 2612 if (rq) { 2613 mark_eio(rq); 2614 2615 rq->engine = engine; 2616 __i915_request_submit(rq); 2617 i915_request_put(rq); 2618 2619 ve->base.execlists.queue_priority_hint = INT_MIN; 2620 } 2621 spin_unlock(&ve->base.active.lock); 2622 } 2623 2624 /* Remaining _unready_ requests will be nop'ed when submitted */ 2625 2626 execlists->queue_priority_hint = INT_MIN; 2627 execlists->queue = RB_ROOT_CACHED; 2628 2629 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 2630 execlists->tasklet.func = nop_submission_tasklet; 2631 2632 spin_unlock_irqrestore(&engine->active.lock, flags); 2633 } 2634 2635 static void execlists_reset_finish(struct intel_engine_cs *engine) 2636 { 2637 struct intel_engine_execlists * const execlists = &engine->execlists; 2638 2639 /* 2640 * After a GPU reset, we may have requests to replay. Do so now while 2641 * we still have the forcewake to be sure that the GPU is not allowed 2642 * to sleep before we restart and reload a context. 2643 */ 2644 GEM_BUG_ON(!reset_in_progress(execlists)); 2645 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 2646 execlists->tasklet.func(execlists->tasklet.data); 2647 2648 if (__tasklet_enable(&execlists->tasklet)) 2649 /* And kick in case we missed a new request submission. */ 2650 tasklet_hi_schedule(&execlists->tasklet); 2651 GEM_TRACE("%s: depth->%d\n", engine->name, 2652 atomic_read(&execlists->tasklet.count)); 2653 } 2654 2655 static int gen8_emit_bb_start(struct i915_request *rq, 2656 u64 offset, u32 len, 2657 const unsigned int flags) 2658 { 2659 u32 *cs; 2660 2661 cs = intel_ring_begin(rq, 4); 2662 if (IS_ERR(cs)) 2663 return PTR_ERR(cs); 2664 2665 /* 2666 * WaDisableCtxRestoreArbitration:bdw,chv 2667 * 2668 * We don't need to perform MI_ARB_ENABLE as often as we do (in 2669 * particular all the gen that do not need the w/a at all!), if we 2670 * took care to make sure that on every switch into this context 2671 * (both ordinary and for preemption) that arbitrartion was enabled 2672 * we would be fine. However, for gen8 there is another w/a that 2673 * requires us to not preempt inside GPGPU execution, so we keep 2674 * arbitration disabled for gen8 batches. Arbitration will be 2675 * re-enabled before we close the request 2676 * (engine->emit_fini_breadcrumb). 2677 */ 2678 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2679 2680 /* FIXME(BDW+): Address space and security selectors. */ 2681 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 2682 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 2683 *cs++ = lower_32_bits(offset); 2684 *cs++ = upper_32_bits(offset); 2685 2686 intel_ring_advance(rq, cs); 2687 2688 return 0; 2689 } 2690 2691 static int gen9_emit_bb_start(struct i915_request *rq, 2692 u64 offset, u32 len, 2693 const unsigned int flags) 2694 { 2695 u32 *cs; 2696 2697 cs = intel_ring_begin(rq, 6); 2698 if (IS_ERR(cs)) 2699 return PTR_ERR(cs); 2700 2701 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2702 2703 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 2704 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 2705 *cs++ = lower_32_bits(offset); 2706 *cs++ = upper_32_bits(offset); 2707 2708 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2709 *cs++ = MI_NOOP; 2710 2711 intel_ring_advance(rq, cs); 2712 2713 return 0; 2714 } 2715 2716 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 2717 { 2718 ENGINE_WRITE(engine, RING_IMR, 2719 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 2720 ENGINE_POSTING_READ(engine, RING_IMR); 2721 } 2722 2723 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 2724 { 2725 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 2726 } 2727 2728 static int gen8_emit_flush(struct i915_request *request, u32 mode) 2729 { 2730 u32 cmd, *cs; 2731 2732 cs = intel_ring_begin(request, 4); 2733 if (IS_ERR(cs)) 2734 return PTR_ERR(cs); 2735 2736 cmd = MI_FLUSH_DW + 1; 2737 2738 /* We always require a command barrier so that subsequent 2739 * commands, such as breadcrumb interrupts, are strictly ordered 2740 * wrt the contents of the write cache being flushed to memory 2741 * (and thus being coherent from the CPU). 2742 */ 2743 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 2744 2745 if (mode & EMIT_INVALIDATE) { 2746 cmd |= MI_INVALIDATE_TLB; 2747 if (request->engine->class == VIDEO_DECODE_CLASS) 2748 cmd |= MI_INVALIDATE_BSD; 2749 } 2750 2751 *cs++ = cmd; 2752 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; 2753 *cs++ = 0; /* upper addr */ 2754 *cs++ = 0; /* value */ 2755 intel_ring_advance(request, cs); 2756 2757 return 0; 2758 } 2759 2760 static int gen8_emit_flush_render(struct i915_request *request, 2761 u32 mode) 2762 { 2763 struct intel_engine_cs *engine = request->engine; 2764 u32 scratch_addr = 2765 intel_gt_scratch_offset(engine->gt, 2766 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); 2767 bool vf_flush_wa = false, dc_flush_wa = false; 2768 u32 *cs, flags = 0; 2769 int len; 2770 2771 flags |= PIPE_CONTROL_CS_STALL; 2772 2773 if (mode & EMIT_FLUSH) { 2774 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 2775 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 2776 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 2777 flags |= PIPE_CONTROL_FLUSH_ENABLE; 2778 } 2779 2780 if (mode & EMIT_INVALIDATE) { 2781 flags |= PIPE_CONTROL_TLB_INVALIDATE; 2782 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 2783 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 2784 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 2785 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 2786 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 2787 flags |= PIPE_CONTROL_QW_WRITE; 2788 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 2789 2790 /* 2791 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 2792 * pipe control. 
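 * (That NULL pipe control is the empty gen8_emit_pipe_control(cs, 0, 0)
 * emitted below when vf_flush_wa is set.)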
2793 */ 2794 if (IS_GEN(request->i915, 9)) 2795 vf_flush_wa = true; 2796 2797 /* WaForGAMHang:kbl */ 2798 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 2799 dc_flush_wa = true; 2800 } 2801 2802 len = 6; 2803 2804 if (vf_flush_wa) 2805 len += 6; 2806 2807 if (dc_flush_wa) 2808 len += 12; 2809 2810 cs = intel_ring_begin(request, len); 2811 if (IS_ERR(cs)) 2812 return PTR_ERR(cs); 2813 2814 if (vf_flush_wa) 2815 cs = gen8_emit_pipe_control(cs, 0, 0); 2816 2817 if (dc_flush_wa) 2818 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 2819 0); 2820 2821 cs = gen8_emit_pipe_control(cs, flags, scratch_addr); 2822 2823 if (dc_flush_wa) 2824 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 2825 2826 intel_ring_advance(request, cs); 2827 2828 return 0; 2829 } 2830 2831 static int gen11_emit_flush_render(struct i915_request *request, 2832 u32 mode) 2833 { 2834 struct intel_engine_cs *engine = request->engine; 2835 const u32 scratch_addr = 2836 intel_gt_scratch_offset(engine->gt, 2837 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); 2838 2839 if (mode & EMIT_FLUSH) { 2840 u32 *cs; 2841 u32 flags = 0; 2842 2843 flags |= PIPE_CONTROL_CS_STALL; 2844 2845 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 2846 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 2847 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 2848 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 2849 flags |= PIPE_CONTROL_FLUSH_ENABLE; 2850 flags |= PIPE_CONTROL_QW_WRITE; 2851 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 2852 2853 cs = intel_ring_begin(request, 6); 2854 if (IS_ERR(cs)) 2855 return PTR_ERR(cs); 2856 2857 cs = gen8_emit_pipe_control(cs, flags, scratch_addr); 2858 intel_ring_advance(request, cs); 2859 } 2860 2861 if (mode & EMIT_INVALIDATE) { 2862 u32 *cs; 2863 u32 flags = 0; 2864 2865 flags |= PIPE_CONTROL_CS_STALL; 2866 2867 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 2868 flags |= PIPE_CONTROL_TLB_INVALIDATE; 2869 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 2870 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 2871 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 2872 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 2873 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 2874 flags |= PIPE_CONTROL_QW_WRITE; 2875 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 2876 2877 cs = intel_ring_begin(request, 6); 2878 if (IS_ERR(cs)) 2879 return PTR_ERR(cs); 2880 2881 cs = gen8_emit_pipe_control(cs, flags, scratch_addr); 2882 intel_ring_advance(request, cs); 2883 } 2884 2885 return 0; 2886 } 2887 2888 /* 2889 * Reserve space for 2 NOOPs at the end of each request to be 2890 * used as a workaround for not being allowed to do lite 2891 * restore with HEAD==TAIL (WaIdleLiteRestore). 2892 */ 2893 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 2894 { 2895 /* Ensure there's always at least one preemption point per-request. 
*/ 2896 *cs++ = MI_ARB_CHECK; 2897 *cs++ = MI_NOOP; 2898 request->wa_tail = intel_ring_offset(request, cs); 2899 2900 return cs; 2901 } 2902 2903 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 2904 { 2905 *cs++ = MI_SEMAPHORE_WAIT | 2906 MI_SEMAPHORE_GLOBAL_GTT | 2907 MI_SEMAPHORE_POLL | 2908 MI_SEMAPHORE_SAD_EQ_SDD; 2909 *cs++ = 0; 2910 *cs++ = intel_hws_preempt_address(request->engine); 2911 *cs++ = 0; 2912 2913 return cs; 2914 } 2915 2916 static __always_inline u32* 2917 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 2918 u32 *cs) 2919 { 2920 *cs++ = MI_USER_INTERRUPT; 2921 2922 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2923 if (intel_engine_has_semaphores(request->engine)) 2924 cs = emit_preempt_busywait(request, cs); 2925 2926 request->tail = intel_ring_offset(request, cs); 2927 assert_ring_tail_valid(request->ring, request->tail); 2928 2929 return gen8_emit_wa_tail(request, cs); 2930 } 2931 2932 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 2933 { 2934 cs = gen8_emit_ggtt_write(cs, 2935 request->fence.seqno, 2936 request->timeline->hwsp_offset, 2937 0); 2938 2939 return gen8_emit_fini_breadcrumb_footer(request, cs); 2940 } 2941 2942 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 2943 { 2944 cs = gen8_emit_ggtt_write_rcs(cs, 2945 request->fence.seqno, 2946 request->timeline->hwsp_offset, 2947 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 2948 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 2949 PIPE_CONTROL_DC_FLUSH_ENABLE); 2950 2951 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 2952 cs = gen8_emit_pipe_control(cs, 2953 PIPE_CONTROL_FLUSH_ENABLE | 2954 PIPE_CONTROL_CS_STALL, 2955 0); 2956 2957 return gen8_emit_fini_breadcrumb_footer(request, cs); 2958 } 2959 2960 static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, 2961 u32 *cs) 2962 { 2963 cs = gen8_emit_ggtt_write_rcs(cs, 2964 request->fence.seqno, 2965 request->timeline->hwsp_offset, 2966 PIPE_CONTROL_CS_STALL | 2967 PIPE_CONTROL_TILE_CACHE_FLUSH | 2968 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 2969 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 2970 PIPE_CONTROL_DC_FLUSH_ENABLE | 2971 PIPE_CONTROL_FLUSH_ENABLE); 2972 2973 return gen8_emit_fini_breadcrumb_footer(request, cs); 2974 } 2975 2976 static void execlists_park(struct intel_engine_cs *engine) 2977 { 2978 del_timer(&engine->execlists.timer); 2979 } 2980 2981 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 2982 { 2983 engine->submit_request = execlists_submit_request; 2984 engine->cancel_requests = execlists_cancel_requests; 2985 engine->schedule = i915_schedule; 2986 engine->execlists.tasklet.func = execlists_submission_tasklet; 2987 2988 engine->reset.prepare = execlists_reset_prepare; 2989 engine->reset.reset = execlists_reset; 2990 engine->reset.finish = execlists_reset_finish; 2991 2992 engine->park = execlists_park; 2993 engine->unpark = NULL; 2994 2995 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 2996 if (!intel_vgpu_active(engine->i915)) { 2997 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 2998 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 2999 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 3000 } 3001 } 3002 3003 static void execlists_destroy(struct intel_engine_cs *engine) 3004 { 3005 intel_engine_cleanup_common(engine); 3006 lrc_destroy_wa_ctx(engine); 3007 kfree(engine); 3008 } 3009 3010 static void 3011 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 3012 { 3013 /* Default vfuncs which can be overriden by each 
engine. */ 3014 3015 engine->destroy = execlists_destroy; 3016 engine->resume = execlists_resume; 3017 3018 engine->reset.prepare = execlists_reset_prepare; 3019 engine->reset.reset = execlists_reset; 3020 engine->reset.finish = execlists_reset_finish; 3021 3022 engine->cops = &execlists_context_ops; 3023 engine->request_alloc = execlists_request_alloc; 3024 3025 engine->emit_flush = gen8_emit_flush; 3026 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 3027 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 3028 3029 engine->set_default_submission = intel_execlists_set_default_submission; 3030 3031 if (INTEL_GEN(engine->i915) < 11) { 3032 engine->irq_enable = gen8_logical_ring_enable_irq; 3033 engine->irq_disable = gen8_logical_ring_disable_irq; 3034 } else { 3035 /* 3036 * TODO: On Gen11 interrupt masks need to be clear 3037 * to allow C6 entry. Keep interrupts enabled at 3038 * and take the hit of generating extra interrupts 3039 * until a more refined solution exists. 3040 */ 3041 } 3042 if (IS_GEN(engine->i915, 8)) 3043 engine->emit_bb_start = gen8_emit_bb_start; 3044 else 3045 engine->emit_bb_start = gen9_emit_bb_start; 3046 } 3047 3048 static inline void 3049 logical_ring_default_irqs(struct intel_engine_cs *engine) 3050 { 3051 unsigned int shift = 0; 3052 3053 if (INTEL_GEN(engine->i915) < 11) { 3054 const u8 irq_shifts[] = { 3055 [RCS0] = GEN8_RCS_IRQ_SHIFT, 3056 [BCS0] = GEN8_BCS_IRQ_SHIFT, 3057 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 3058 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 3059 [VECS0] = GEN8_VECS_IRQ_SHIFT, 3060 }; 3061 3062 shift = irq_shifts[engine->id]; 3063 } 3064 3065 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 3066 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 3067 } 3068 3069 static void rcs_submission_override(struct intel_engine_cs *engine) 3070 { 3071 switch (INTEL_GEN(engine->i915)) { 3072 case 12: 3073 case 11: 3074 engine->emit_flush = gen11_emit_flush_render; 3075 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 3076 break; 3077 default: 3078 engine->emit_flush = gen8_emit_flush_render; 3079 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 3080 break; 3081 } 3082 } 3083 3084 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 3085 { 3086 tasklet_init(&engine->execlists.tasklet, 3087 execlists_submission_tasklet, (unsigned long)engine); 3088 timer_setup(&engine->execlists.timer, execlists_submission_timer, 0); 3089 3090 logical_ring_default_vfuncs(engine); 3091 logical_ring_default_irqs(engine); 3092 3093 if (engine->class == RENDER_CLASS) 3094 rcs_submission_override(engine); 3095 3096 return 0; 3097 } 3098 3099 int intel_execlists_submission_init(struct intel_engine_cs *engine) 3100 { 3101 struct intel_engine_execlists * const execlists = &engine->execlists; 3102 struct drm_i915_private *i915 = engine->i915; 3103 struct intel_uncore *uncore = engine->uncore; 3104 u32 base = engine->mmio_base; 3105 int ret; 3106 3107 ret = intel_engine_init_common(engine); 3108 if (ret) 3109 return ret; 3110 3111 if (intel_init_workaround_bb(engine)) 3112 /* 3113 * We continue even if we fail to initialize WA batch 3114 * because we only expect rare glitches but nothing 3115 * critical to prevent us from using GPU 3116 */ 3117 DRM_ERROR("WA batch buffer initialization failed\n"); 3118 3119 if (HAS_LOGICAL_RING_ELSQ(i915)) { 3120 execlists->submit_reg = uncore->regs + 3121 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 3122 execlists->ctrl_reg = uncore->regs + 3123 
i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 3124 } else { 3125 execlists->submit_reg = uncore->regs + 3126 i915_mmio_reg_offset(RING_ELSP(base)); 3127 } 3128 3129 execlists->csb_status = 3130 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 3131 3132 execlists->csb_write = 3133 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 3134 3135 if (INTEL_GEN(i915) < 11) 3136 execlists->csb_size = GEN8_CSB_ENTRIES; 3137 else 3138 execlists->csb_size = GEN11_CSB_ENTRIES; 3139 3140 reset_csb_pointers(engine); 3141 3142 return 0; 3143 } 3144 3145 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine) 3146 { 3147 u32 indirect_ctx_offset; 3148 3149 switch (INTEL_GEN(engine->i915)) { 3150 default: 3151 MISSING_CASE(INTEL_GEN(engine->i915)); 3152 /* fall through */ 3153 case 12: 3154 indirect_ctx_offset = 3155 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3156 break; 3157 case 11: 3158 indirect_ctx_offset = 3159 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3160 break; 3161 case 10: 3162 indirect_ctx_offset = 3163 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3164 break; 3165 case 9: 3166 indirect_ctx_offset = 3167 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3168 break; 3169 case 8: 3170 indirect_ctx_offset = 3171 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3172 break; 3173 } 3174 3175 return indirect_ctx_offset; 3176 } 3177 3178 static void execlists_init_reg_state(u32 *regs, 3179 struct intel_context *ce, 3180 struct intel_engine_cs *engine, 3181 struct intel_ring *ring) 3182 { 3183 struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm); 3184 bool rcs = engine->class == RENDER_CLASS; 3185 u32 base = engine->mmio_base; 3186 3187 /* 3188 * A context is actually a big batch buffer with several 3189 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 3190 * values we are setting here are only for the first context restore: 3191 * on a subsequent save, the GPU will recreate this batchbuffer with new 3192 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 3193 * we are not initializing here). 3194 * 3195 * Must keep consistent with virtual_update_register_offsets(). 3196 */ 3197 regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 
14 : 11) | 3198 MI_LRI_FORCE_POSTED; 3199 3200 CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base), 3201 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | 3202 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH)); 3203 if (INTEL_GEN(engine->i915) < 11) { 3204 regs[CTX_CONTEXT_CONTROL + 1] |= 3205 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 3206 CTX_CTRL_RS_CTX_ENABLE); 3207 } 3208 CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0); 3209 CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0); 3210 CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0); 3211 CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base), 3212 RING_CTL_SIZE(ring->size) | RING_VALID); 3213 CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0); 3214 CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0); 3215 CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT); 3216 CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0); 3217 CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0); 3218 CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0); 3219 if (rcs) { 3220 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3221 3222 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0); 3223 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET, 3224 RING_INDIRECT_CTX_OFFSET(base), 0); 3225 if (wa_ctx->indirect_ctx.size) { 3226 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 3227 3228 regs[CTX_RCS_INDIRECT_CTX + 1] = 3229 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 3230 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 3231 3232 regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] = 3233 intel_lr_indirect_ctx_offset(engine) << 6; 3234 } 3235 3236 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0); 3237 if (wa_ctx->per_ctx.size) { 3238 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 3239 3240 regs[CTX_BB_PER_CTX_PTR + 1] = 3241 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 3242 } 3243 } 3244 3245 regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED; 3246 3247 CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0); 3248 /* PDP values well be assigned later if needed */ 3249 CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0); 3250 CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0); 3251 CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0); 3252 CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0); 3253 CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0); 3254 CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0); 3255 CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0); 3256 CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0); 3257 3258 if (i915_vm_is_4lvl(&ppgtt->vm)) { 3259 /* 64b PPGTT (48bit canonical) 3260 * PDP0_DESCRIPTOR contains the base address to PML4 and 3261 * other PDP Descriptors are ignored. 
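 *
 * A sketch of the two cases handled below (macro internals omitted):
 *
 *	4-level: only PDP0 is programmed, with the PML4 base address
 *	3-level: PDP0..PDP3 each take a page-directory address, as is
 *		 also done dynamically in emit_pdps() above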
3262 */ 3263 ASSIGN_CTX_PML4(ppgtt, regs); 3264 } else { 3265 ASSIGN_CTX_PDP(ppgtt, regs, 3); 3266 ASSIGN_CTX_PDP(ppgtt, regs, 2); 3267 ASSIGN_CTX_PDP(ppgtt, regs, 1); 3268 ASSIGN_CTX_PDP(ppgtt, regs, 0); 3269 } 3270 3271 if (rcs) { 3272 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); 3273 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0); 3274 } 3275 3276 regs[CTX_END] = MI_BATCH_BUFFER_END; 3277 if (INTEL_GEN(engine->i915) >= 10) 3278 regs[CTX_END] |= BIT(0); 3279 } 3280 3281 static int 3282 populate_lr_context(struct intel_context *ce, 3283 struct drm_i915_gem_object *ctx_obj, 3284 struct intel_engine_cs *engine, 3285 struct intel_ring *ring) 3286 { 3287 void *vaddr; 3288 u32 *regs; 3289 int ret; 3290 3291 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 3292 if (IS_ERR(vaddr)) { 3293 ret = PTR_ERR(vaddr); 3294 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 3295 return ret; 3296 } 3297 3298 set_redzone(vaddr, engine); 3299 3300 if (engine->default_state) { 3301 /* 3302 * We only want to copy over the template context state; 3303 * skipping over the headers reserved for GuC communication, 3304 * leaving those as zero. 3305 */ 3306 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE; 3307 void *defaults; 3308 3309 defaults = i915_gem_object_pin_map(engine->default_state, 3310 I915_MAP_WB); 3311 if (IS_ERR(defaults)) { 3312 ret = PTR_ERR(defaults); 3313 goto err_unpin_ctx; 3314 } 3315 3316 memcpy(vaddr + start, defaults + start, engine->context_size); 3317 i915_gem_object_unpin_map(engine->default_state); 3318 } 3319 3320 /* The second page of the context object contains some fields which must 3321 * be set up prior to the first execution. */ 3322 regs = vaddr + LRC_STATE_PN * PAGE_SIZE; 3323 execlists_init_reg_state(regs, ce, engine, ring); 3324 if (!engine->default_state) 3325 regs[CTX_CONTEXT_CONTROL + 1] |= 3326 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 3327 3328 ret = 0; 3329 err_unpin_ctx: 3330 __i915_gem_object_flush_map(ctx_obj, 3331 LRC_HEADER_PAGES * PAGE_SIZE, 3332 engine->context_size); 3333 i915_gem_object_unpin_map(ctx_obj); 3334 return ret; 3335 } 3336 3337 static int __execlists_context_alloc(struct intel_context *ce, 3338 struct intel_engine_cs *engine) 3339 { 3340 struct drm_i915_gem_object *ctx_obj; 3341 struct intel_ring *ring; 3342 struct i915_vma *vma; 3343 u32 context_size; 3344 int ret; 3345 3346 GEM_BUG_ON(ce->state); 3347 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 3348 3349 /* 3350 * Before the actual start of the context image, we insert a few pages 3351 * for our own use and for sharing with the GuC. 
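 *
 * A rough sketch of the resulting backing object, as implied by the
 * sizes used in this file (the redzone page only exists with
 * CONFIG_DRM_I915_DEBUG_GEM):
 *
 *	[ LRC_HEADER_PAGES ][ PPHWSP ][ register state ... ][ redzone ]
 *	                    \____ engine->context_size ____/
 *
 * with ce->lrc_reg_state mapped at vaddr + LRC_STATE_PN * PAGE_SIZE.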
3352 */ 3353 context_size += LRC_HEADER_PAGES * PAGE_SIZE; 3354 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3355 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 3356 3357 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 3358 if (IS_ERR(ctx_obj)) 3359 return PTR_ERR(ctx_obj); 3360 3361 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 3362 if (IS_ERR(vma)) { 3363 ret = PTR_ERR(vma); 3364 goto error_deref_obj; 3365 } 3366 3367 if (!ce->timeline) { 3368 struct intel_timeline *tl; 3369 3370 tl = intel_timeline_create(engine->gt, NULL); 3371 if (IS_ERR(tl)) { 3372 ret = PTR_ERR(tl); 3373 goto error_deref_obj; 3374 } 3375 3376 ce->timeline = tl; 3377 } 3378 3379 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 3380 if (IS_ERR(ring)) { 3381 ret = PTR_ERR(ring); 3382 goto error_deref_obj; 3383 } 3384 3385 ret = populate_lr_context(ce, ctx_obj, engine, ring); 3386 if (ret) { 3387 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 3388 goto error_ring_free; 3389 } 3390 3391 ce->ring = ring; 3392 ce->state = vma; 3393 3394 return 0; 3395 3396 error_ring_free: 3397 intel_ring_put(ring); 3398 error_deref_obj: 3399 i915_gem_object_put(ctx_obj); 3400 return ret; 3401 } 3402 3403 static struct list_head *virtual_queue(struct virtual_engine *ve) 3404 { 3405 return &ve->base.execlists.default_priolist.requests[0]; 3406 } 3407 3408 static void virtual_context_destroy(struct kref *kref) 3409 { 3410 struct virtual_engine *ve = 3411 container_of(kref, typeof(*ve), context.ref); 3412 unsigned int n; 3413 3414 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 3415 GEM_BUG_ON(ve->request); 3416 GEM_BUG_ON(ve->context.inflight); 3417 3418 for (n = 0; n < ve->num_siblings; n++) { 3419 struct intel_engine_cs *sibling = ve->siblings[n]; 3420 struct rb_node *node = &ve->nodes[sibling->id].rb; 3421 3422 if (RB_EMPTY_NODE(node)) 3423 continue; 3424 3425 spin_lock_irq(&sibling->active.lock); 3426 3427 /* Detachment is lazily performed in the execlists tasklet */ 3428 if (!RB_EMPTY_NODE(node)) 3429 rb_erase_cached(node, &sibling->execlists.virtual); 3430 3431 spin_unlock_irq(&sibling->active.lock); 3432 } 3433 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 3434 3435 if (ve->context.state) 3436 __execlists_context_fini(&ve->context); 3437 intel_context_fini(&ve->context); 3438 3439 kfree(ve->bonds); 3440 kfree(ve); 3441 } 3442 3443 static void virtual_engine_initial_hint(struct virtual_engine *ve) 3444 { 3445 int swp; 3446 3447 /* 3448 * Pick a random sibling on starting to help spread the load around. 3449 * 3450 * New contexts are typically created with exactly the same order 3451 * of siblings, and often started in batches. Due to the way we iterate 3452 * the array of sibling when submitting requests, sibling[0] is 3453 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 3454 * randomised across the system, we also help spread the load by the 3455 * first engine we inspect being different each time. 3456 * 3457 * NB This does not force us to execute on this engine, it will just 3458 * typically be the first we inspect for submission. 
3459 */ 3460 swp = prandom_u32_max(ve->num_siblings); 3461 if (!swp) 3462 return; 3463 3464 swap(ve->siblings[swp], ve->siblings[0]); 3465 virtual_update_register_offsets(ve->context.lrc_reg_state, 3466 ve->siblings[0]); 3467 } 3468 3469 static int virtual_context_pin(struct intel_context *ce) 3470 { 3471 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 3472 int err; 3473 3474 /* Note: we must use a real engine class for setting up reg state */ 3475 err = __execlists_context_pin(ce, ve->siblings[0]); 3476 if (err) 3477 return err; 3478 3479 virtual_engine_initial_hint(ve); 3480 return 0; 3481 } 3482 3483 static void virtual_context_enter(struct intel_context *ce) 3484 { 3485 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 3486 unsigned int n; 3487 3488 for (n = 0; n < ve->num_siblings; n++) 3489 intel_engine_pm_get(ve->siblings[n]); 3490 3491 intel_timeline_enter(ce->timeline); 3492 } 3493 3494 static void virtual_context_exit(struct intel_context *ce) 3495 { 3496 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 3497 unsigned int n; 3498 3499 intel_timeline_exit(ce->timeline); 3500 3501 for (n = 0; n < ve->num_siblings; n++) 3502 intel_engine_pm_put(ve->siblings[n]); 3503 } 3504 3505 static const struct intel_context_ops virtual_context_ops = { 3506 .pin = virtual_context_pin, 3507 .unpin = execlists_context_unpin, 3508 3509 .enter = virtual_context_enter, 3510 .exit = virtual_context_exit, 3511 3512 .destroy = virtual_context_destroy, 3513 }; 3514 3515 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 3516 { 3517 struct i915_request *rq; 3518 intel_engine_mask_t mask; 3519 3520 rq = READ_ONCE(ve->request); 3521 if (!rq) 3522 return 0; 3523 3524 /* The rq is ready for submission; rq->execution_mask is now stable. */ 3525 mask = rq->execution_mask; 3526 if (unlikely(!mask)) { 3527 /* Invalid selection, submit to a random engine in error */ 3528 i915_request_skip(rq, -ENODEV); 3529 mask = ve->siblings[0]->mask; 3530 } 3531 3532 GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n", 3533 ve->base.name, 3534 rq->fence.context, rq->fence.seqno, 3535 mask, ve->base.execlists.queue_priority_hint); 3536 3537 return mask; 3538 } 3539 3540 static void virtual_submission_tasklet(unsigned long data) 3541 { 3542 struct virtual_engine * const ve = (struct virtual_engine *)data; 3543 const int prio = ve->base.execlists.queue_priority_hint; 3544 intel_engine_mask_t mask; 3545 unsigned int n; 3546 3547 rcu_read_lock(); 3548 mask = virtual_submission_mask(ve); 3549 rcu_read_unlock(); 3550 if (unlikely(!mask)) 3551 return; 3552 3553 local_irq_disable(); 3554 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 3555 struct intel_engine_cs *sibling = ve->siblings[n]; 3556 struct ve_node * const node = &ve->nodes[sibling->id]; 3557 struct rb_node **parent, *rb; 3558 bool first; 3559 3560 if (unlikely(!(mask & sibling->mask))) { 3561 if (!RB_EMPTY_NODE(&node->rb)) { 3562 spin_lock(&sibling->active.lock); 3563 rb_erase_cached(&node->rb, 3564 &sibling->execlists.virtual); 3565 RB_CLEAR_NODE(&node->rb); 3566 spin_unlock(&sibling->active.lock); 3567 } 3568 continue; 3569 } 3570 3571 spin_lock(&sibling->active.lock); 3572 3573 if (!RB_EMPTY_NODE(&node->rb)) { 3574 /* 3575 * Cheat and avoid rebalancing the tree if we can 3576 * reuse this node in situ. 
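 * i.e. if the node's priority is unchanged, or has only increased
 * while the node is already first in the tree, we skip the
 * erase/re-insert below and jump straight to resubmission.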

static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = ve->base.execlists.queue_priority_hint;
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint) {
			sibling->execlists.queue_priority_hint = prio;
			tasklet_hi_schedule(&sibling->execlists.tasklet);
		}

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}

static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	GEM_TRACE("%s: rq=%llx:%lld\n",
		  ve->base.name,
		  rq->fence.context,
		  rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}
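
/*
 * Usage sketch (illustrative only; the engine lookups are assumed and error
 * handling is abbreviated): a virtual engine load-balancing across both
 * video command streamers could be created with
 *
 *	struct intel_engine_cs *siblings[] = {
 *		i915->engine[VCS0],
 *		i915->engine[VCS1],
 *	};
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(ctx, siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 * Requests submitted through such a context may run on whichever sibling
 * the tasklet picks, subject to rq->execution_mask and any attached bonds.
 */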

struct intel_context *
intel_execlists_create_virtual(struct i915_gem_context *ctx,
			       struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(ctx, siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = ctx->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.id = -1;
	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be applied globally to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);

	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, ctx, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handled cloning of the requests and
		 * submitted a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	err = __execlists_context_alloc(&ve->context, siblings[0]);
	if (err)
		goto err_put;

	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);

	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct i915_gem_context *ctx,
			      struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(ctx,
					     se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}
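
/*
 * Illustrative sketch (not driver code; the engine pointers are assumed for
 * the example): bonding restricts where a bonded request may run once its
 * master has been placed. After
 *
 *	intel_virtual_engine_attach_bond(&ve->base, vcs0, vcs1);
 *
 * a request on the virtual engine that waits on a submit fence from a
 * request executing on vcs0 has its execution_mask narrowed to vcs1 (plus
 * any other siblings bonded against the same master) by
 * virtual_bond_execute().
 */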

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	last = NULL;
	count = 0;
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   execlists->queue_priority_hint);
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be submitted only after userspace has had the
	 * opportunity to recreate its own state.
	 */
	if (scrub) {
		u32 *regs = ce->lrc_reg_state;

		if (engine->pinned_default_state) {
			memcpy(regs, /* skip restoring the vanilla PPHWSP */
			       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
			       engine->context_size - PAGE_SIZE);
		}
		execlists_init_reg_state(regs, ce, engine, ce->ring);
	}

	/* Rerun the request; its payload has been neutered (if guilty). */
	ce->ring->head = head;
	intel_ring_update_space(ce->ring);

	__execlists_update_reg_state(ce, engine);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif