/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/dma-fence-array.h>
#include <linux/irq_work.h>
#include <linux/prefetch.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_context.h"
#include "gt/intel_ring.h"
#include "gt/intel_rps.h"

#include "i915_active.h"
#include "i915_drv.h"
#include "i915_globals.h"
#include "i915_trace.h"
#include "intel_pm.h"

struct execute_cb {
	struct list_head link;
	struct irq_work work;
	struct i915_sw_fence *fence;
	void (*hook)(struct i915_request *rq, struct dma_fence *signal);
	struct i915_request *signal;
};

static struct i915_global_request {
	struct i915_global base;
	struct kmem_cache *slab_requests;
	struct kmem_cache *slab_execute_cbs;
} global;

static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
	return dev_name(to_request(fence)->i915->drm.dev);
}

static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
{
	const struct i915_gem_context *ctx;

	/*
	 * The timeline struct (as part of the ppgtt underneath a context)
	 * may be freed when the request is no longer in use by the GPU.
	 * We could extend the life of a context to beyond that of all
	 * fences, possibly keeping the hw resource around indefinitely,
	 * or we just give them a false name. Since
	 * dma_fence_ops.get_timeline_name is a debug feature, the occasional
	 * lie seems justifiable.
	 */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return "signaled";

	ctx = i915_request_gem_context(to_request(fence));
	if (!ctx)
		return "[" DRIVER_NAME "]";

	return ctx->name;
}

static bool i915_fence_signaled(struct dma_fence *fence)
{
	return i915_request_completed(to_request(fence));
}

static bool i915_fence_enable_signaling(struct dma_fence *fence)
{
	return i915_request_enable_breadcrumb(to_request(fence));
}

static signed long i915_fence_wait(struct dma_fence *fence,
				   bool interruptible,
				   signed long timeout)
{
	return i915_request_wait(to_request(fence),
				 interruptible | I915_WAIT_PRIORITY,
				 timeout);
}

static void i915_fence_release(struct dma_fence *fence)
{
	struct i915_request *rq = to_request(fence);

	/*
	 * The request is put onto an RCU freelist (i.e. the address
	 * is immediately reused), mark the fences as being freed now.
	 * Otherwise the debugobjects for the fences are only marked as
	 * freed when the slab cache itself is freed, and so we would get
	 * caught trying to reuse dead objects.
	 */
	i915_sw_fence_fini(&rq->submit);
	i915_sw_fence_fini(&rq->semaphore);

	kmem_cache_free(global.slab_requests, rq);
}

const struct dma_fence_ops i915_fence_ops = {
	.get_driver_name = i915_fence_get_driver_name,
	.get_timeline_name = i915_fence_get_timeline_name,
	.enable_signaling = i915_fence_enable_signaling,
	.signaled = i915_fence_signaled,
	.wait = i915_fence_wait,
	.release = i915_fence_release,
};

static void irq_execute_cb(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	i915_sw_fence_complete(cb->fence);
	kmem_cache_free(global.slab_execute_cbs, cb);
}

static void irq_execute_cb_hook(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	cb->hook(container_of(cb->fence, struct i915_request, submit),
		 &cb->signal->fence);
	i915_request_put(cb->signal);

	irq_execute_cb(wrk);
}

static void __notify_execute_cb(struct i915_request *rq)
{
	struct execute_cb *cb;

	lockdep_assert_held(&rq->lock);

	if (list_empty(&rq->execute_cb))
		return;

	list_for_each_entry(cb, &rq->execute_cb, link)
		irq_work_queue(&cb->work);

	/*
	 * XXX Rollback on __i915_request_unsubmit()
	 *
	 * In the future, perhaps when we have an active time-slicing scheduler,
	 * it will be interesting to unsubmit parallel execution and remove
	 * busywaits from the GPU until their master is restarted. This is
	 * quite hairy, we have to carefully rollback the fence and do a
	 * preempt-to-idle cycle on the target engine, all the while the
	 * master execute_cb may refire.
	 */
	INIT_LIST_HEAD(&rq->execute_cb);
}

static inline void
remove_from_client(struct i915_request *request)
{
	struct drm_i915_file_private *file_priv;

	if (!READ_ONCE(request->file_priv))
		return;

	rcu_read_lock();
	file_priv = xchg(&request->file_priv, NULL);
	if (file_priv) {
		spin_lock(&file_priv->mm.lock);
		list_del(&request->client_link);
		spin_unlock(&file_priv->mm.lock);
	}
	rcu_read_unlock();
}

static void free_capture_list(struct i915_request *request)
{
	struct i915_capture_list *capture;

	capture = fetch_and_zero(&request->capture_list);
	while (capture) {
		struct i915_capture_list *next = capture->next;

		kfree(capture);
		capture = next;
	}
}

static void __i915_request_fill(struct i915_request *rq, u8 val)
{
	void *vaddr = rq->ring->vaddr;
	u32 head;

	head = rq->infix;
	if (rq->postfix < head) {
		memset(vaddr + head, val, rq->ring->size - head);
		head = 0;
	}
	memset(vaddr + head, val, rq->postfix - head);
}

static void remove_from_engine(struct i915_request *rq)
{
	struct intel_engine_cs *engine, *locked;

	/*
	 * Virtual engines complicate acquiring the engine timeline lock,
	 * as their rq->engine pointer is not stable until under that
	 * engine lock. The simple ploy we use is to take the lock then
	 * check that the rq still belongs to the newly locked engine.
	 */
	locked = READ_ONCE(rq->engine);
	spin_lock_irq(&locked->active.lock);
	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
		spin_unlock(&locked->active.lock);
		spin_lock(&engine->active.lock);
		locked = engine;
	}
	list_del_init(&rq->sched.link);
	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
	clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
	spin_unlock_irq(&locked->active.lock);
}

bool i915_request_retire(struct i915_request *rq)
{
	if (!i915_request_completed(rq))
		return false;

	RQ_TRACE(rq, "\n");

	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
	trace_i915_request_retire(rq);

	/*
	 * We know the GPU must have read the request to have
	 * sent us the seqno + interrupt, so use the position
	 * of tail of the request to update the last known position
	 * of the GPU head.
	 *
	 * Note this requires that we are always called in request
	 * completion order.
	 */
	GEM_BUG_ON(!list_is_first(&rq->link,
				  &i915_request_timeline(rq)->requests));
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		/* Poison before we release our space in the ring */
		__i915_request_fill(rq, POISON_FREE);
	rq->ring->head = rq->postfix;

	/*
	 * We only loosely track inflight requests across preemption,
	 * and so we may find ourselves attempting to retire a _completed_
	 * request that we have removed from the HW and put back on a run
	 * queue.
	 */
	remove_from_engine(rq);

	spin_lock_irq(&rq->lock);
	i915_request_mark_complete(rq);
	if (!i915_request_signaled(rq))
		dma_fence_signal_locked(&rq->fence);
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq);
	if (i915_request_has_waitboost(rq)) {
		GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
		atomic_dec(&rq->engine->gt->rps.num_waiters);
	}
	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
		set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
		__notify_execute_cb(rq);
	}
	GEM_BUG_ON(!list_empty(&rq->execute_cb));
	spin_unlock_irq(&rq->lock);

	remove_from_client(rq);
	list_del(&rq->link);

	intel_context_exit(rq->context);
	intel_context_unpin(rq->context);

	free_capture_list(rq);
	i915_sched_node_fini(&rq->sched);
	i915_request_put(rq);

	return true;
}

void i915_request_retire_upto(struct i915_request *rq)
{
	struct intel_timeline * const tl = i915_request_timeline(rq);
	struct i915_request *tmp;

	RQ_TRACE(rq, "\n");

	GEM_BUG_ON(!i915_request_completed(rq));

	do {
		tmp = list_first_entry(&tl->requests, typeof(*tmp), link);
	} while (i915_request_retire(tmp) && tmp != rq);
}

static int
__await_execution(struct i915_request *rq,
		  struct i915_request *signal,
		  void (*hook)(struct i915_request *rq,
			       struct dma_fence *signal),
		  gfp_t gfp)
{
	struct execute_cb *cb;

	if (i915_request_is_active(signal)) {
		if (hook)
			hook(rq, &signal->fence);
		return 0;
	}

	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
	if (!cb)
		return -ENOMEM;

	cb->fence = &rq->submit;
	i915_sw_fence_await(cb->fence);
	init_irq_work(&cb->work, irq_execute_cb);

	if (hook) {
		cb->hook = hook;
		cb->signal = i915_request_get(signal);
		cb->work.func = irq_execute_cb_hook;
	}

	spin_lock_irq(&signal->lock);
	if (i915_request_is_active(signal)) {
		if (hook) {
			hook(rq, &signal->fence);
			i915_request_put(signal);
		}
		i915_sw_fence_complete(cb->fence);
		kmem_cache_free(global.slab_execute_cbs, cb);
	} else {
		list_add_tail(&cb->link, &signal->execute_cb);
	}
	spin_unlock_irq(&signal->lock);

	/* Copy across semaphore status as we need the same behaviour */
	rq->sched.flags |= signal->sched.flags;
	return 0;
}

bool __i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	bool result = false;

	RQ_TRACE(request, "\n");

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * With the advent of preempt-to-busy, we frequently encounter
	 * requests that we have unsubmitted from HW, but left running
	 * until the next ack and so have completed in the meantime. On
	 * resubmission of that completed request, we can skip
	 * updating the payload, and execlists can even skip submitting
	 * the request.
	 *
	 * We must remove the request from the caller's priority queue,
	 * and the caller must only call us when the request is in their
	 * priority queue, under the active.lock. This ensures that the
	 * request has *not* yet been retired and we can safely move
	 * the request into the engine->active.list where it will be
	 * dropped upon retiring.
	 * (Otherwise, if we resubmit a *retired* request, this would be a
	 * horrible use-after-free.)
	 */
	if (i915_request_completed(request))
		goto xfer;

	if (intel_context_is_banned(request->context))
		i915_request_skip(request, -EIO);

	/*
	 * Are we using semaphores when the gpu is already saturated?
	 *
	 * Using semaphores incurs a cost in having the GPU poll a
	 * memory location, busywaiting for it to change. The continual
	 * memory reads can have a noticeable impact on the rest of the
	 * system with the extra bus traffic, stalling the cpu as it too
	 * tries to access memory across the bus (perf stat -e bus-cycles).
	 *
	 * If we installed a semaphore on this request and we only submit
	 * the request after the signaler completed, that indicates the
	 * system is overloaded and using semaphores at this time only
	 * increases the amount of work we are doing. If so, we disable
	 * further use of semaphores until we are idle again, whence we
	 * optimistically try again.
	 */
	if (request->sched.semaphores &&
	    i915_sw_fence_signaled(&request->semaphore))
		engine->saturated |= request->sched.semaphores;

	engine->emit_fini_breadcrumb(request,
				     request->ring->vaddr + request->postfix);

	trace_i915_request_execute(request);
	engine->serial++;
	result = true;

xfer:	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
		list_move_tail(&request->sched.link, &engine->active.requests);
		clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
	}

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
	    !i915_request_enable_breadcrumb(request))
		intel_engine_signal_breadcrumbs(engine);

	__notify_execute_cb(request);

	spin_unlock(&request->lock);

	return result;
}

void i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_submit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void __i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;

	RQ_TRACE(request, "\n");

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * Only unwind in reverse order, required so that the per-context list
	 * is kept in seqno/ring order.
	 */

	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
		i915_request_cancel_breadcrumb(request);

	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);

	spin_unlock(&request->lock);

	/* We've already spun, don't charge on resubmitting. */
	if (request->sched.semaphores && i915_request_started(request)) {
		request->sched.attr.priority |= I915_PRIORITY_NOSEMAPHORE;
		request->sched.semaphores = 0;
	}

	/*
	 * We don't need to wake_up any waiters on request->execute, they
	 * will get woken by any other event or us re-adding this request
	 * to the engine timeline (__i915_request_submit()). The waiters
	 * should be quite adept at finding that the request now has a new
	 * global_seqno to the one they went to sleep on.
	 */
}

void i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_unsubmit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

static int __i915_sw_fence_call
submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), submit);

	switch (state) {
	case FENCE_COMPLETE:
		trace_i915_request_submit(request);

		if (unlikely(fence->error))
			i915_request_skip(request, fence->error);

		/*
		 * We need to serialize use of the submit_request() callback
		 * with its hotplugging performed during an emergency
		 * i915_gem_set_wedged(). We use the RCU mechanism to mark the
		 * critical section in order to force i915_gem_set_wedged() to
		 * wait until the submit_request() is completed before
		 * proceeding.
		 */
		rcu_read_lock();
		request->engine->submit_request(request);
		rcu_read_unlock();
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static int __i915_sw_fence_call
semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), semaphore);

	switch (state) {
	case FENCE_COMPLETE:
		i915_schedule_bump_priority(request, I915_PRIORITY_NOSEMAPHORE);
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static void retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
}

static noinline struct i915_request *
request_alloc_slow(struct intel_timeline *tl, gfp_t gfp)
{
	struct i915_request *rq;

	if (list_empty(&tl->requests))
		goto out;

	if (!gfpflags_allow_blocking(gfp))
		goto out;

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	i915_request_retire(rq);

	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (rq)
		return rq;

	/* Ratelimit ourselves to prevent oom from malicious clients */
	rq = list_last_entry(&tl->requests, typeof(*rq), link);
	cond_synchronize_rcu(rq->rcustate);

	/* Retire our old requests in the hope that we free some */
	retire_requests(tl);

out:
	return kmem_cache_alloc(global.slab_requests, gfp);
}

static void __i915_request_ctor(void *arg)
{
	struct i915_request *rq = arg;

	spin_lock_init(&rq->lock);
	i915_sched_node_init(&rq->sched);
	i915_sw_fence_init(&rq->submit, submit_notify);
	i915_sw_fence_init(&rq->semaphore, semaphore_notify);

	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0);

	rq->file_priv = NULL;
	rq->capture_list = NULL;

	INIT_LIST_HEAD(&rq->execute_cb);
}

struct i915_request *
__i915_request_create(struct intel_context *ce, gfp_t gfp)
{
	struct intel_timeline *tl = ce->timeline;
	struct i915_request *rq;
	u32 seqno;
	int ret;

	might_sleep_if(gfpflags_allow_blocking(gfp));

	/* Check that the caller provided an already pinned context */
	__intel_context_pin(ce);

	/*
	 * Beware: Dragons be flying overhead.
	 *
	 * We use RCU to look up requests in flight. The lookups may
	 * race with the request being allocated from the slab freelist.
	 * That is, the request we are writing to here may be in the process
	 * of being read by __i915_active_request_get_rcu(). As such,
	 * we have to be very careful when overwriting the contents. During
	 * the RCU lookup, we chase the request->engine pointer,
	 * read the request->global_seqno and increment the reference count.
	 *
	 * The reference count is incremented atomically. If it is zero,
	 * the lookup knows the request is unallocated and complete. Otherwise,
	 * it is either still in use, or has been reallocated and reset
	 * with dma_fence_init(). This increment is safe for release as we
	 * check that the request we have a reference to matches the active
	 * request.
	 *
	 * Before we increment the refcount, we chase the request->engine
	 * pointer. We must not call kmem_cache_zalloc() or else we set
	 * that pointer to NULL and cause a crash during the lookup. If
	 * we see the request is completed (based on the value of the
	 * old engine and seqno), the lookup is complete and reports NULL.
	 * If we decide the request is not completed (new engine or seqno),
	 * then we grab a reference and double check that it is still the
	 * active request - which it won't be and restart the lookup.
	 *
	 * Do not use kmem_cache_zalloc() here!
	 */
	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (unlikely(!rq)) {
		rq = request_alloc_slow(tl, gfp);
		if (!rq) {
			ret = -ENOMEM;
			goto err_unreserve;
		}
	}

	rq->i915 = ce->engine->i915;
	rq->context = ce;
	rq->engine = ce->engine;
	rq->ring = ce->ring;
	rq->execution_mask = ce->engine->mask;

	kref_init(&rq->fence.refcount);
	rq->fence.flags = 0;
	rq->fence.error = 0;
	INIT_LIST_HEAD(&rq->fence.cb_list);

	ret = intel_timeline_get_seqno(tl, rq, &seqno);
	if (ret)
		goto err_free;

	rq->fence.context = tl->fence_context;
	rq->fence.seqno = seqno;

	RCU_INIT_POINTER(rq->timeline, tl);
	RCU_INIT_POINTER(rq->hwsp_cacheline, tl->hwsp_cacheline);
	rq->hwsp_seqno = tl->hwsp_seqno;

	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */

	/* We bump the ref for the fence chain */
	i915_sw_fence_reinit(&i915_request_get(rq)->submit);
	i915_sw_fence_reinit(&i915_request_get(rq)->semaphore);

	i915_sched_node_reinit(&rq->sched);

	/* No zalloc, everything must be cleared after use */
	rq->batch = NULL;
	GEM_BUG_ON(rq->file_priv);
	GEM_BUG_ON(rq->capture_list);
	GEM_BUG_ON(!list_empty(&rq->execute_cb));

	/*
	 * Reserve space in the ring buffer for all the commands required to
	 * eventually emit this request. This is to guarantee that the
	 * i915_request_add() call can't fail. Note that the reserve may need
	 * to be redone if the request is not actually submitted straight
	 * away, e.g. because a GPU scheduler has deferred it.
	 *
	 * Note that due to how we add reserved_space to intel_ring_begin()
	 * we need to double our request to ensure that if we need to wrap
	 * around inside i915_request_add() there is sufficient space at
	 * the beginning of the ring as well.
	 */
	rq->reserved_space =
		2 * rq->engine->emit_fini_breadcrumb_dw * sizeof(u32);

	/*
	 * Record the position of the start of the request so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the head.
	 */
	rq->head = rq->ring->emit;

	ret = rq->engine->request_alloc(rq);
	if (ret)
		goto err_unwind;

	rq->infix = rq->ring->emit; /* end of header; start of user payload */

	intel_context_mark_active(ce);
	return rq;

err_unwind:
	ce->ring->emit = rq->head;

	/* Make sure we didn't add ourselves to external state before freeing */
	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));

err_free:
	kmem_cache_free(global.slab_requests, rq);
err_unreserve:
	intel_context_unpin(ce);
	return ERR_PTR(ret);
}

struct i915_request *
i915_request_create(struct intel_context *ce)
{
	struct i915_request *rq;
	struct intel_timeline *tl;

	tl = intel_context_timeline_lock(ce);
	if (IS_ERR(tl))
		return ERR_CAST(tl);

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	if (!list_is_last(&rq->link, &tl->requests))
		i915_request_retire(rq);

	intel_context_enter(ce);
	rq = __i915_request_create(ce, GFP_KERNEL);
	intel_context_exit(ce); /* active reference transferred to request */
	if (IS_ERR(rq))
		goto err_unlock;

	/* Check that we do not interrupt ourselves with a new request */
	rq->cookie = lockdep_pin_lock(&tl->mutex);

	return rq;

err_unlock:
	intel_context_timeline_unlock(tl);
	return rq;
}

static int
i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
{
	struct dma_fence *fence;
	int err;

	GEM_BUG_ON(i915_request_timeline(rq) ==
		   rcu_access_pointer(signal->timeline));

	fence = NULL;
	rcu_read_lock();
	spin_lock_irq(&signal->lock);
	if (!i915_request_started(signal) &&
	    !list_is_first(&signal->link,
			   &rcu_dereference(signal->timeline)->requests)) {
		struct i915_request *prev = list_prev_entry(signal, link);

		/*
		 * Peek at the request before us in the timeline. That
		 * request will only be valid before it is retired, so
		 * after acquiring a reference to it, confirm that it is
		 * still part of the signaler's timeline.
		 */
		if (i915_request_get_rcu(prev)) {
			if (list_next_entry(prev, link) == signal)
				fence = &prev->fence;
			else
				i915_request_put(prev);
		}
	}
	spin_unlock_irq(&signal->lock);
	rcu_read_unlock();
	if (!fence)
		return 0;

	err = 0;
	if (intel_timeline_sync_is_later(i915_request_timeline(rq), fence))
		err = i915_sw_fence_await_dma_fence(&rq->submit,
						    fence, 0,
						    I915_FENCE_GFP);
	dma_fence_put(fence);

	return err;
}

static intel_engine_mask_t
already_busywaiting(struct i915_request *rq)
{
	/*
	 * Polling a semaphore causes bus traffic, delaying other users of
	 * both the GPU and CPU. We want to limit the impact on others,
	 * while taking advantage of early submission to reduce GPU
	 * latency. Therefore we restrict ourselves to not using more
	 * than one semaphore from each source, and not using a semaphore
	 * if we have detected the engine is saturated (i.e. would not be
	 * submitted early and cause bus traffic reading an already passed
	 * semaphore).
	 *
	 * See the are-we-too-late? check in __i915_request_submit().
	 */
	return rq->sched.semaphores | rq->engine->saturated;
}

static int
__emit_semaphore_wait(struct i915_request *to,
		      struct i915_request *from,
		      u32 seqno)
{
	const int has_token = INTEL_GEN(to->i915) >= 12;
	u32 hwsp_offset;
	int len, err;
	u32 *cs;

	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);

	/* We need to pin the signaler's HWSP until we are finished reading. */
	err = intel_timeline_read_hwsp(from, to, &hwsp_offset);
	if (err)
		return err;

	len = 4;
	if (has_token)
		len += 2;

	cs = intel_ring_begin(to, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Using greater-than-or-equal here means we have to worry
	 * about seqno wraparound. To side step that issue, we swap
	 * the timeline HWSP upon wrapping, so that everyone listening
	 * for the old (pre-wrap) values do not see much smaller
	 * (post-wrap) values than they were expecting (and so wait
	 * forever).
	 */
	*cs++ = (MI_SEMAPHORE_WAIT |
		 MI_SEMAPHORE_GLOBAL_GTT |
		 MI_SEMAPHORE_POLL |
		 MI_SEMAPHORE_SAD_GTE_SDD) +
		has_token;
	*cs++ = seqno;
	*cs++ = hwsp_offset;
	*cs++ = 0;
	if (has_token) {
		*cs++ = 0;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(to, cs);
	return 0;
}

static int
emit_semaphore_wait(struct i915_request *to,
		    struct i915_request *from,
		    gfp_t gfp)
{
	if (!intel_context_use_semaphores(to->context))
		goto await_fence;

	if (!rcu_access_pointer(from->hwsp_cacheline))
		goto await_fence;

	/* Just emit the first semaphore we see as request space is limited. */
	if (already_busywaiting(to) & from->engine->mask)
		goto await_fence;

	if (i915_request_await_start(to, from) < 0)
		goto await_fence;

	/* Only submit our spinner after the signaler is running! */
	if (__await_execution(to, from, NULL, gfp))
		goto await_fence;

	if (__emit_semaphore_wait(to, from, from->fence.seqno))
		goto await_fence;

	to->sched.semaphores |= from->engine->mask;
	to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
	return 0;

await_fence:
	return i915_sw_fence_await_dma_fence(&to->submit,
					     &from->fence, 0,
					     I915_FENCE_GFP);
}

static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
	int ret;

	GEM_BUG_ON(to == from);
	GEM_BUG_ON(to->timeline == from->timeline);

	if (i915_request_completed(from))
		return 0;

	if (to->engine->schedule) {
		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (ret < 0)
			return ret;
	}

	if (to->engine == from->engine)
		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
						       &from->submit,
						       I915_FENCE_GFP);
	else
		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
	if (ret < 0)
		return ret;

	if (to->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN) {
		ret = i915_sw_fence_await_dma_fence(&to->semaphore,
						    &from->fence, 0,
						    I915_FENCE_GFP);
		if (ret < 0)
			return ret;
	}

	return 0;
}

int
i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	/*
	 * Note that if the fence-array was created in signal-on-any mode,
	 * we should *not* decompose it into its individual fences. However,
	 * we don't currently store which mode the fence-array is operating
	 * in. Fortunately, the only user of signal-on-any is private to
	 * amdgpu and we should not see any incoming fence-array from
	 * sync-file being in signal-on-any mode.
	 */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
			i915_sw_fence_set_error_once(&rq->submit, fence->error);
			continue;
		}

		/*
		 * Requests on the same timeline are explicitly ordered, along
		 * with their dependencies, by i915_request_add() which ensures
		 * that requests are submitted in-order through each ring.
		 */
		if (fence->context == rq->fence.context)
			continue;

		/* Squash repeated waits to the same timelines */
		if (fence->context &&
		    intel_timeline_sync_is_later(i915_request_timeline(rq),
						 fence))
			continue;

		if (dma_fence_is_i915(fence))
			ret = i915_request_await_request(rq, to_request(fence));
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    fence->context ? I915_FENCE_TIMEOUT : 0,
							    I915_FENCE_GFP);
		if (ret < 0)
			return ret;

		/* Record the latest fence used against each timeline */
		if (fence->context)
			intel_timeline_sync_set(i915_request_timeline(rq),
						fence);
	} while (--nchild);

	return 0;
}

static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
					  struct dma_fence *fence)
{
	return __intel_timeline_sync_is_later(tl,
					      fence->context,
					      fence->seqno - 1);
}

static int intel_timeline_sync_set_start(struct intel_timeline *tl,
					 const struct dma_fence *fence)
{
	return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
}

static int
__i915_request_await_execution(struct i915_request *to,
			       struct i915_request *from,
			       void (*hook)(struct i915_request *rq,
					    struct dma_fence *signal))
{
	int err;

	GEM_BUG_ON(intel_context_is_barrier(from->context));

	/* Submit both requests at the same time */
	err = __await_execution(to, from, hook, I915_FENCE_GFP);
	if (err)
		return err;

	/* Squash repeated dependencies to the same timelines */
	if (intel_timeline_sync_has_start(i915_request_timeline(to),
					  &from->fence))
		return 0;

	/* Ensure both start together [after all semaphores in signal] */
	if (intel_engine_has_semaphores(to->engine))
		err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
	else
		err = i915_request_await_start(to, from);
	if (err < 0)
		return err;

	/* Couple the dependency tree for PI on this exposed to->fence */
	if (to->engine->schedule) {
		err = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (err < 0)
			return err;
	}

	return intel_timeline_sync_set_start(i915_request_timeline(to),
					     &from->fence);
}

int
i915_request_await_execution(struct i915_request *rq,
			     struct dma_fence *fence,
			     void (*hook)(struct i915_request *rq,
					  struct dma_fence *signal))
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		/* XXX Error for signal-on-any fence arrays */

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
			i915_sw_fence_set_error_once(&rq->submit, fence->error);
			continue;
		}

		/*
		 * We don't squash repeated fence dependencies here as we
		 * want to run our callback in all cases.
		 */

		if (dma_fence_is_i915(fence))
			ret = __i915_request_await_execution(rq,
							     to_request(fence),
							     hook);
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    I915_FENCE_TIMEOUT,
							    GFP_KERNEL);
		if (ret < 0)
			return ret;
	} while (--nchild);

	return 0;
}

/**
 * i915_request_await_object - set this request to (async) wait upon a bo
 * @to: request we are wishing to use
 * @obj: object which may be in use on another ring.
 * @write: whether the wait is on behalf of a writer
 *
 * This code is meant to abstract object synchronization with the GPU.
 * Conceptually we serialise writes between engines inside the GPU.
 * We only allow one engine to write into a buffer at any time, but
 * multiple readers. To ensure each has a coherent view of memory, we must:
 *
 * - If there is an outstanding write request to the object, the new
 *   request must wait for it to complete (either CPU or in hw, requests
 *   on the same ring will be naturally ordered).
 *
 * - If we are a write request (pending_write_domain is set), the new
 *   request must wait for outstanding read requests to complete.
 *
 * Returns 0 if successful, else propagates up the lower layer error.
 */
int
i915_request_await_object(struct i915_request *to,
			  struct drm_i915_gem_object *obj,
			  bool write)
{
	struct dma_fence *excl;
	int ret = 0;

	if (write) {
		struct dma_fence **shared;
		unsigned int count, i;

		ret = dma_resv_get_fences_rcu(obj->base.resv,
					      &excl, &count, &shared);
		if (ret)
			return ret;

		for (i = 0; i < count; i++) {
			ret = i915_request_await_dma_fence(to, shared[i]);
			if (ret)
				break;

			dma_fence_put(shared[i]);
		}

		for (; i < count; i++)
			dma_fence_put(shared[i]);
		kfree(shared);
	} else {
		excl = dma_resv_get_excl_rcu(obj->base.resv);
	}

	if (excl) {
		if (ret == 0)
			ret = i915_request_await_dma_fence(to, excl);

		dma_fence_put(excl);
	}

	return ret;
}

void i915_request_skip(struct i915_request *rq, int error)
{
	GEM_BUG_ON(!IS_ERR_VALUE((long)error));
	dma_fence_set_error(&rq->fence, error);

	if (rq->infix == rq->postfix)
		return;

	/*
	 * As this request likely depends on state from the lost
	 * context, clear out all the user operations leaving the
	 * breadcrumb at the end (so we get the fence notifications).
	 */
	__i915_request_fill(rq, 0);
	rq->infix = rq->postfix;
}

static struct i915_request *
__i915_request_add_to_timeline(struct i915_request *rq)
{
	struct intel_timeline *timeline = i915_request_timeline(rq);
	struct i915_request *prev;

	/*
	 * Dependency tracking and request ordering along the timeline
	 * is special cased so that we can eliminate redundant ordering
	 * operations while building the request (we know that the timeline
	 * itself is ordered, and here we guarantee it).
	 *
	 * As we know we will need to emit tracking along the timeline,
	 * we embed the hooks into our request struct -- at the cost of
	 * having to have specialised no-allocation interfaces (which will
	 * be beneficial elsewhere).
	 *
	 * A second benefit to open-coding i915_request_await_request is
	 * that we can apply a slight variant of the rules specialised
	 * for timelines that jump between engines (such as virtual engines).
	 * If we consider the case of a virtual engine, we must emit a dma-fence
	 * to prevent scheduling of the second request until the first is
	 * complete (to maximise our greedy late load balancing) and this
	 * precludes optimising to use semaphore serialisation of a single
	 * timeline across engines.
	 */
	prev = to_request(__i915_active_fence_set(&timeline->last_request,
						  &rq->fence));
	if (prev && !i915_request_completed(prev)) {
		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
			i915_sw_fence_await_sw_fence(&rq->submit,
						     &prev->submit,
						     &rq->submitq);
		else
			__i915_sw_fence_await_dma_fence(&rq->submit,
							&prev->fence,
							&rq->dmaq);
		if (rq->engine->schedule)
			__i915_sched_node_add_dependency(&rq->sched,
							 &prev->sched,
							 &rq->dep,
							 0);
	}

	list_add_tail(&rq->link, &timeline->requests);

	/*
	 * Make sure that no request gazumped us - if it was allocated after
	 * our i915_request_alloc() and called __i915_request_add() before
	 * us, the timeline will hold its seqno which is later than ours.
	 */
	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);

	return prev;
}

/*
 * NB: This function is not allowed to fail. Doing so would mean the
 * request is not being tracked for completion but the work itself is
 * going to happen on the hardware. This would be a Bad Thing(tm).
 */
struct i915_request *__i915_request_commit(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_ring *ring = rq->ring;
	u32 *cs;

	RQ_TRACE(rq, "\n");

	/*
	 * To ensure that this call will not fail, space for its emissions
	 * should already have been reserved in the ring buffer. Let the ring
	 * know that it is time to use that space up.
	 */
	GEM_BUG_ON(rq->reserved_space > ring->space);
	rq->reserved_space = 0;
	rq->emitted_jiffies = jiffies;

	/*
	 * Record the position of the start of the breadcrumb so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the ring's HEAD.
	 */
	cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw);
	GEM_BUG_ON(IS_ERR(cs));
	rq->postfix = intel_ring_offset(rq, cs);

	return __i915_request_add_to_timeline(rq);
}

void __i915_request_queue(struct i915_request *rq,
			  const struct i915_sched_attr *attr)
{
	/*
	 * Let the backend know a new request has arrived that may need
	 * to adjust the existing execution schedule due to a high priority
	 * request - i.e. we may want to preempt the current request in order
	 * to run a high priority dependency chain *before* we can execute this
	 * request.
	 *
	 * This is called before the request is ready to run so that we can
	 * decide whether to preempt the entire chain so that it is ready to
	 * run at the earliest possible convenience.
	 */
	i915_sw_fence_commit(&rq->semaphore);
	if (attr && rq->engine->schedule)
		rq->engine->schedule(rq, attr);
	i915_sw_fence_commit(&rq->submit);
}

void i915_request_add(struct i915_request *rq)
{
	struct intel_timeline * const tl = i915_request_timeline(rq);
	struct i915_sched_attr attr = {};
	struct i915_request *prev;

	lockdep_assert_held(&tl->mutex);
	lockdep_unpin_lock(&tl->mutex, rq->cookie);

	trace_i915_request_add(rq);

	prev = __i915_request_commit(rq);

	if (rcu_access_pointer(rq->context->gem_context))
		attr = i915_request_gem_context(rq)->sched;

	/*
	 * Boost actual workloads past semaphores!
	 *
	 * With semaphores we spin on one engine waiting for another,
	 * simply to reduce the latency of starting our work when
	 * the signaler completes. However, if there is any other
	 * work that we could be doing on this engine instead, that
	 * is better utilisation and will reduce the overall duration
	 * of the current work. To avoid PI boosting a semaphore
	 * far in the distance past over useful work, we keep a history
	 * of any semaphore use along our dependency chain.
	 */
	if (!(rq->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
		attr.priority |= I915_PRIORITY_NOSEMAPHORE;

	/*
	 * Boost priorities to new clients (new request flows).
	 *
	 * Allow interactive/synchronous clients to jump ahead of
	 * the bulk clients. (FQ_CODEL)
	 */
	if (list_empty(&rq->sched.signalers_list))
		attr.priority |= I915_PRIORITY_WAIT;

	local_bh_disable();
	__i915_request_queue(rq, &attr);
	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */

	/*
	 * In typical scenarios, we do not expect the previous request on
	 * the timeline to be still tracked by timeline->last_request if it
	 * has been completed. If the completed request is still here, that
	 * implies that request retirement is a long way behind submission,
	 * suggesting that we haven't been retiring frequently enough from
	 * the combination of retire-before-alloc, waiters and the background
	 * retirement worker. So if the last request on this timeline was
	 * already completed, do a catch up pass, flushing the retirement queue
	 * up to this client. Since we have now moved the heaviest operations
	 * during retirement onto secondary workers, such as freeing objects
	 * or contexts, retiring a bunch of requests is mostly list management
	 * (and cache misses), and so we should not be overly penalizing this
	 * client by performing excess work, though we may still be performing
	 * work on behalf of others -- but instead we should benefit from
	 * improved resource management. (Well, that's the theory at least.)
	 */
	if (prev &&
	    i915_request_completed(prev) &&
	    rcu_access_pointer(prev->timeline) == tl)
		i915_request_retire_upto(prev);

	mutex_unlock(&tl->mutex);
}

static unsigned long local_clock_us(unsigned int *cpu)
{
	unsigned long t;

	/*
	 * Cheaply and approximately convert from nanoseconds to microseconds.
	 * The result and subsequent calculations are also defined in the same
	 * approximate microseconds units. The principal source of timing
	 * error here is from the simple truncation.
	 *
	 * Note that local_clock() is only defined wrt the current CPU;
	 * the comparisons are no longer valid if we switch CPUs. Instead of
	 * blocking preemption for the entire busywait, we can detect the CPU
	 * switch and use that as an indicator of system load and a reason to
	 * stop busywaiting, see busywait_stop().
	 */
	*cpu = get_cpu();
	t = local_clock() >> 10;
	put_cpu();

	return t;
}

static bool busywait_stop(unsigned long timeout, unsigned int cpu)
{
	unsigned int this_cpu;

	if (time_after(local_clock_us(&this_cpu), timeout))
		return true;

	return this_cpu != cpu;
}

static bool __i915_spin_request(const struct i915_request * const rq,
				int state, unsigned long timeout_us)
{
	unsigned int cpu;

	/*
	 * Only wait for the request if we know it is likely to complete.
	 *
	 * We don't track the timestamps around requests, nor the average
	 * request length, so we do not have a good indicator that this
	 * request will complete within the timeout. What we do know is the
	 * order in which requests are executed by the context and so we can
	 * tell if the request has been started. If the request is not even
	 * running yet, it is a fair assumption that it will not complete
	 * within our relatively short timeout.
	 */
	if (!i915_request_is_running(rq))
		return false;

	/*
	 * When waiting for high frequency requests, e.g. during synchronous
	 * rendering split between the CPU and GPU, the finite amount of time
	 * required to set up the irq and wait upon it limits the response
	 * rate. By busywaiting on the request completion for a short while we
	 * can service the high frequency waits as quickly as possible. However,
	 * if it is a slow request, we want to sleep as quickly as possible.
	 * The tradeoff between waiting and sleeping is roughly the time it
	 * takes to sleep on a request, on the order of a microsecond.
	 */

	timeout_us += local_clock_us(&cpu);
	do {
		if (i915_request_completed(rq))
			return true;

		if (signal_pending_state(state, current))
			break;

		if (busywait_stop(timeout_us, cpu))
			break;

		cpu_relax();
	} while (!need_resched());

	return false;
}

struct request_wait {
	struct dma_fence_cb cb;
	struct task_struct *tsk;
};

static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct request_wait *wait = container_of(cb, typeof(*wait), cb);

	wake_up_process(wait->tsk);
}

/**
 * i915_request_wait - wait until execution of request has finished
 * @rq: the request to wait upon
 * @flags: how to wait
 * @timeout: how long to wait in jiffies
 *
 * i915_request_wait() waits for the request to be completed, for a
 * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an
 * unbounded wait).
 *
 * Returns the remaining time (in jiffies) if the request completed, which may
 * be zero or -ETIME if the request is unfinished after the timeout expires.
 * May return -EINTR if called with I915_WAIT_INTERRUPTIBLE and a signal is
 * pending before the request completes.
 */
long i915_request_wait(struct i915_request *rq,
		       unsigned int flags,
		       long timeout)
{
	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
	struct request_wait wait;

	might_sleep();
	GEM_BUG_ON(timeout < 0);

	if (dma_fence_is_signaled(&rq->fence))
		return timeout;

	if (!timeout)
		return -ETIME;

	trace_i915_request_wait_begin(rq, flags);

	/*
	 * We must never wait on the GPU while holding a lock as we
	 * may need to perform a GPU reset. So while we don't need to
	 * serialise wait/reset with an explicit lock, we do want
	 * lockdep to detect potential dependency cycles.
	 */
	mutex_acquire(&rq->engine->gt->reset.mutex.dep_map, 0, 0, _THIS_IP_);

	/*
	 * Optimistic spin before touching IRQs.
	 *
	 * We may use a rather large value here to offset the penalty of
	 * switching away from the active task. Frequently, the client will
	 * wait upon an old swapbuffer to throttle itself to remain within a
	 * frame of the gpu. If the client is running in lockstep with the gpu,
	 * then it should not be waiting long at all, and a sleep now will incur
	 * extra scheduler latency in producing the next frame. To try to
	 * avoid adding the cost of enabling/disabling the interrupt to the
	 * short wait, we first spin to see if the request would have completed
	 * in the time taken to set up the interrupt.
	 *
	 * We need up to 5us to enable the irq, and up to 20us to hide the
	 * scheduler latency of a context switch, ignoring the secondary
	 * impacts from a context switch such as cache eviction.
	 *
	 * The scheme used for low-latency IO is called "hybrid interrupt
	 * polling". The suggestion there is to sleep until just before you
	 * expect to be woken by the device interrupt and then poll for its
	 * completion. That requires having a good predictor for the request
	 * duration, which we currently lack.
	 */
	if (IS_ACTIVE(CONFIG_DRM_I915_SPIN_REQUEST) &&
	    __i915_spin_request(rq, state, CONFIG_DRM_I915_SPIN_REQUEST)) {
		dma_fence_signal(&rq->fence);
		goto out;
	}

	/*
	 * This client is about to stall waiting for the GPU. In many cases
	 * this is undesirable and limits the throughput of the system, as
	 * many clients cannot continue processing user input/output whilst
	 * blocked. RPS autotuning may take tens of milliseconds to respond
	 * to the GPU load and thus incurs additional latency for the client.
	 * We can circumvent that by promoting the GPU frequency to maximum
	 * before we sleep. This makes the GPU throttle up much more quickly
	 * (good for benchmarks and user experience, e.g. window animations),
	 * but at a cost of spending more power processing the workload
	 * (bad for battery).
	 */
	if (flags & I915_WAIT_PRIORITY) {
		if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
			intel_rps_boost(rq);
		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
	}

	wait.tsk = current;
	if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
		goto out;

	for (;;) {
		set_current_state(state);

		if (i915_request_completed(rq)) {
			dma_fence_signal(&rq->fence);
			break;
		}

		intel_engine_flush_submission(rq->engine);

		if (signal_pending_state(state, current)) {
			timeout = -ERESTARTSYS;
			break;
		}

		if (!timeout) {
			timeout = -ETIME;
			break;
		}

		timeout = io_schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);

	dma_fence_remove_callback(&rq->fence, &wait.cb);

out:
	mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_);
	trace_i915_request_wait_end(rq);
	return timeout;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_request.c"
#endif

static void i915_global_request_shrink(void)
{
	kmem_cache_shrink(global.slab_execute_cbs);
	kmem_cache_shrink(global.slab_requests);
}

static void i915_global_request_exit(void)
{
	kmem_cache_destroy(global.slab_execute_cbs);
	kmem_cache_destroy(global.slab_requests);
}

static struct i915_global_request global = { {
	.shrink = i915_global_request_shrink,
	.exit = i915_global_request_exit,
} };

int __init i915_global_request_init(void)
{
	global.slab_requests =
		kmem_cache_create("i915_request",
				  sizeof(struct i915_request),
				  __alignof__(struct i915_request),
				  SLAB_HWCACHE_ALIGN |
				  SLAB_RECLAIM_ACCOUNT |
				  SLAB_TYPESAFE_BY_RCU,
				  __i915_request_ctor);
	if (!global.slab_requests)
		return -ENOMEM;

	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
					     SLAB_HWCACHE_ALIGN |
					     SLAB_RECLAIM_ACCOUNT |
					     SLAB_TYPESAFE_BY_RCU);
	if (!global.slab_execute_cbs)
		goto err_requests;

	i915_global_register(&global.base);
	return 0;

err_requests:
	kmem_cache_destroy(global.slab_requests);
	return -ENOMEM;
}