/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008,2010 Intel Corporation
 */

#include <linux/intel-iommu.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_syncobj.h>
#include <drm/i915_drm.h>

#include "display/intel_frontbuffer.h"

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_engine_pool.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_ioctls.h"
#include "i915_sw_fence_work.h"
#include "i915_trace.h"

enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};

#define __EXEC_OBJECT_HAS_REF		BIT(31)
#define __EXEC_OBJECT_HAS_PIN		BIT(30)
#define __EXEC_OBJECT_HAS_FENCE		BIT(29)
#define __EXEC_OBJECT_NEEDS_MAP		BIT(28)
#define __EXEC_OBJECT_NEEDS_BIAS	BIT(27)
#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 27) /* all of the above */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)

#define __EXEC_HAS_RELOC	BIT(31)
#define __EXEC_VALIDATED	BIT(30)
#define __EXEC_INTERNAL_FLAGS	(~0u << 30)
#define UPDATE			PIN_OFFSET_FIXED

#define BATCH_OFFSET_BIAS (256*1024)

#define __I915_EXEC_ILLEGAL_FLAGS \
	(__I915_EXEC_UNKNOWN_FLAGS | \
	 I915_EXEC_CONSTANTS_MASK | \
	 I915_EXEC_RESOURCE_STREAMER)

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
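 *
 * For reference, userspace describes each such patch point to the kernel
 * with a struct drm_i915_gem_relocation_entry from the uapi: ``offset`` says
 * where in the containing object the address must be written,
 * ``target_handle`` plus ``delta`` say which address to write, and
 * ``presumed_offset`` records where userspace last saw the target. As a
 * rough sketch only (the values and the ``target_bo_handle`` /
 * ``last_seen_gpu_address`` names are made-up placeholders)::
 *
 *	struct drm_i915_gem_relocation_entry reloc = {
 *		.target_handle   = target_bo_handle,
 *		.delta           = 0,
 *		.offset          = 0x80,
 *		.presumed_offset = last_seen_gpu_address,
 *		.read_domains    = I915_GEM_DOMAIN_RENDER,
 *		.write_domain    = 0,
 *	};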
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute is to add content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer. The memory write writes the
 *    global sequence number of the request, ``i915_request::global_seqno``;
 *    the i915 driver uses the current value in the register to determine
 *    if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
 *
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
 *
 * The reservation is done in multiple phases. First we try to keep any
 * object already bound in its current location - so long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process rerun
 * after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. Such objects are
 * simpler to place (the location is known a priori); all we have to do is make
 * sure the space is available.
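 *
 * For illustration only (the uapi struct, field and flag names are real, but
 * ``bo_handle``, ``presumed_gpu_address`` and ``batch_size`` are made-up
 * placeholders), a userspace that soft-pins a single buffer and opts out of
 * relocations altogether might submit something like::
 *
 *	struct drm_i915_gem_exec_object2 obj = {
 *		.handle = bo_handle,
 *		.offset = presumed_gpu_address,
 *		.flags  = EXEC_OBJECT_PINNED |
 *			  EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr  = (uintptr_t)&obj,
 *		.buffer_count = 1,
 *		.batch_len    = batch_size,
 *		.flags        = I915_EXEC_NO_RELOC | I915_EXEC_HANDLE_LUT,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *
 * On return, if the kernel had to move a buffer it updates the corresponding
 * execobject.offset, which userspace should fold back into its presumed
 * offsets for the next submission.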
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fall back
 * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
 * bells yet?) Dropping the mutex means that we lose all the state we have
 * built up so far for the execbuf and we must reset any global data. However,
 * we do leave the objects pinned in their final locations - which is a
 * potential issue for concurrent execbufs. Once we have left the mutex, we can
 * allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses with the objects.
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NO_RELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the target's
 * final address. If they differ, we have to map the current object and rewrite
 * the 4 or 8 byte pointer within.
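 *
 * A minimal sketch of that CPU-side rewrite for a single entry, assuming the
 * relevant page of the object is already mapped at ``vaddr``, both words land
 * within that page, ``target_gpu_address`` is the target's final address and
 * ``wide`` denotes a platform using 64-bit GPU addresses (the names are
 * illustrative, not the helpers used later in this file)::
 *
 *	u64 addr = target_gpu_address + reloc->delta;
 *
 *	*(u32 *)(vaddr + offset_in_page(reloc->offset)) = lower_32_bits(addr);
 *	if (wide)
 *		*(u32 *)(vaddr + offset_in_page(reloc->offset + 4)) =
 *			upper_32_bits(addr);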
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
 * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
 * reads before starting, and any read (either using set-domain or pread) must
 * flush all GPU writes before starting. (Note we only employ a barrier before,
 * we currently rely on userspace not concurrently starting a new execution
 * whilst reading or writing to an object. This may be an advantage or not
 * depending on how much you trust userspace not to shoot themselves in the
 * foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest approach is to wait on the CPU
 * until all dependencies are resolved.
 *
 * After all of that, it is just a matter of closing the request and handing
 * it to the hardware (well, leaving it in a queue to be executed). However,
 * we also offer the ability for batchbuffers to be run with elevated
 * privileges so that they can access otherwise hidden registers. (Used to
 * adjust L3 cache etc.) Before any batch is given extra privileges we first
 * must check that it contains no nefarious instructions: we check that each
 * instruction is from our whitelist and all registers are also from an
 * allowed list. We first copy the user's batchbuffer to a shadow (so that
 * the user doesn't have access to it, either by the CPU or GPU as we scan
 * it) and then parse each instruction. If everything is ok, we set a flag
 * telling the hardware to run the batchbuffer in trusted mode, otherwise the
 * ioctl is rejected.
 */

struct i915_execbuffer {
	struct drm_i915_private *i915; /** i915 backpointer */
	struct drm_file *file; /** per-file lookup tables and limits */
	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
	struct i915_vma **vma;
	unsigned int *flags;

	struct intel_engine_cs *engine; /** engine to queue the request to */
	struct intel_context *context; /* logical state for the request */
	struct i915_gem_context *gem_context; /** caller's context */

	struct i915_request *request; /** our request to build */
	struct i915_vma *batch; /** identity of the batch obj/vma */
	struct i915_vma *trampoline; /** trampoline used for chaining */

	/** actual size of execobj[] as we may extend it for the cmdparser */
	unsigned int buffer_count;

	/** list of vma not yet bound during reservation phase */
	struct list_head unbound;

	/** list of vma that have execobj.relocation_count */
	struct list_head relocs;

	/**
	 * Track the most recently used object for relocations, as we
	 * frequently have to perform multiple relocations within the same
	 * obj/page
	 */
	struct reloc_cache {
		struct drm_mm_node node; /** temporary GTT binding */
		unsigned long vaddr; /** Current kmap address */
		unsigned long page; /** Currently mapped page index */
		unsigned int gen; /** Cached value of INTEL_GEN */
		bool use_64bit_reloc : 1;
		bool has_llc : 1;
		bool has_fence : 1;
		bool needs_unfenced : 1;

		struct i915_request *rq;
		u32 *rq_cmd;
		unsigned int rq_size;
	} reloc_cache;

	u64 invalid_flags; /** Set of execobj.flags that are invalid */
	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

	u32 batch_start_offset; /** Location within object of batch */
	u32 batch_len; /** Length of batch within object */
	u32 batch_flags; /** Flags composed for emit_bb_start() */

	/**
	 * Indicate either the size of the hashtable used to resolve
	 * relocation handles, or if negative that we are using a direct
	 * index into the execobj[].
274 */ 275 int lut_size; 276 struct hlist_head *buckets; /** ht for relocation handles */ 277 }; 278 279 #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags]) 280 281 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 282 { 283 return intel_engine_requires_cmd_parser(eb->engine) || 284 (intel_engine_using_cmd_parser(eb->engine) && 285 eb->args->batch_len); 286 } 287 288 static int eb_create(struct i915_execbuffer *eb) 289 { 290 if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { 291 unsigned int size = 1 + ilog2(eb->buffer_count); 292 293 /* 294 * Without a 1:1 association between relocation handles and 295 * the execobject[] index, we instead create a hashtable. 296 * We size it dynamically based on available memory, starting 297 * first with 1:1 assocative hash and scaling back until 298 * the allocation succeeds. 299 * 300 * Later on we use a positive lut_size to indicate we are 301 * using this hashtable, and a negative value to indicate a 302 * direct lookup. 303 */ 304 do { 305 gfp_t flags; 306 307 /* While we can still reduce the allocation size, don't 308 * raise a warning and allow the allocation to fail. 309 * On the last pass though, we want to try as hard 310 * as possible to perform the allocation and warn 311 * if it fails. 312 */ 313 flags = GFP_KERNEL; 314 if (size > 1) 315 flags |= __GFP_NORETRY | __GFP_NOWARN; 316 317 eb->buckets = kzalloc(sizeof(struct hlist_head) << size, 318 flags); 319 if (eb->buckets) 320 break; 321 } while (--size); 322 323 if (unlikely(!size)) 324 return -ENOMEM; 325 326 eb->lut_size = size; 327 } else { 328 eb->lut_size = -eb->buffer_count; 329 } 330 331 return 0; 332 } 333 334 static bool 335 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry, 336 const struct i915_vma *vma, 337 unsigned int flags) 338 { 339 if (vma->node.size < entry->pad_to_size) 340 return true; 341 342 if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment)) 343 return true; 344 345 if (flags & EXEC_OBJECT_PINNED && 346 vma->node.start != entry->offset) 347 return true; 348 349 if (flags & __EXEC_OBJECT_NEEDS_BIAS && 350 vma->node.start < BATCH_OFFSET_BIAS) 351 return true; 352 353 if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) && 354 (vma->node.start + vma->node.size - 1) >> 32) 355 return true; 356 357 if (flags & __EXEC_OBJECT_NEEDS_MAP && 358 !i915_vma_is_map_and_fenceable(vma)) 359 return true; 360 361 return false; 362 } 363 364 static inline bool 365 eb_pin_vma(struct i915_execbuffer *eb, 366 const struct drm_i915_gem_exec_object2 *entry, 367 struct i915_vma *vma) 368 { 369 unsigned int exec_flags = *vma->exec_flags; 370 u64 pin_flags; 371 372 if (vma->node.size) 373 pin_flags = vma->node.start; 374 else 375 pin_flags = entry->offset & PIN_OFFSET_MASK; 376 377 pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED; 378 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT)) 379 pin_flags |= PIN_GLOBAL; 380 381 if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) 382 return false; 383 384 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 385 if (unlikely(i915_vma_pin_fence(vma))) { 386 i915_vma_unpin(vma); 387 return false; 388 } 389 390 if (vma->fence) 391 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 392 } 393 394 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 395 return !eb_vma_misplaced(entry, vma, exec_flags); 396 } 397 398 static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags) 399 { 400 GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN)); 401 402 if (unlikely(flags & 
__EXEC_OBJECT_HAS_FENCE)) 403 __i915_vma_unpin_fence(vma); 404 405 __i915_vma_unpin(vma); 406 } 407 408 static inline void 409 eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags) 410 { 411 if (!(*flags & __EXEC_OBJECT_HAS_PIN)) 412 return; 413 414 __eb_unreserve_vma(vma, *flags); 415 *flags &= ~__EXEC_OBJECT_RESERVED; 416 } 417 418 static int 419 eb_validate_vma(struct i915_execbuffer *eb, 420 struct drm_i915_gem_exec_object2 *entry, 421 struct i915_vma *vma) 422 { 423 struct drm_i915_private *i915 = eb->i915; 424 if (unlikely(entry->flags & eb->invalid_flags)) 425 return -EINVAL; 426 427 if (unlikely(entry->alignment && !is_power_of_2(entry->alignment))) 428 return -EINVAL; 429 430 /* 431 * Offset can be used as input (EXEC_OBJECT_PINNED), reject 432 * any non-page-aligned or non-canonical addresses. 433 */ 434 if (unlikely(entry->flags & EXEC_OBJECT_PINNED && 435 entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) 436 return -EINVAL; 437 438 /* pad_to_size was once a reserved field, so sanitize it */ 439 if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { 440 if (unlikely(offset_in_page(entry->pad_to_size))) 441 return -EINVAL; 442 } else { 443 entry->pad_to_size = 0; 444 } 445 446 if (unlikely(vma->exec_flags)) { 447 drm_dbg(&i915->drm, 448 "Object [handle %d, index %d] appears more than once in object list\n", 449 entry->handle, (int)(entry - eb->exec)); 450 return -EINVAL; 451 } 452 453 /* 454 * From drm_mm perspective address space is continuous, 455 * so from this point we're always using non-canonical 456 * form internally. 457 */ 458 entry->offset = gen8_noncanonical_addr(entry->offset); 459 460 if (!eb->reloc_cache.has_fence) { 461 entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE; 462 } else { 463 if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE || 464 eb->reloc_cache.needs_unfenced) && 465 i915_gem_object_is_tiled(vma->obj)) 466 entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; 467 } 468 469 if (!(entry->flags & EXEC_OBJECT_PINNED)) 470 entry->flags |= eb->context_flags; 471 472 return 0; 473 } 474 475 static int 476 eb_add_vma(struct i915_execbuffer *eb, 477 unsigned int i, unsigned batch_idx, 478 struct i915_vma *vma) 479 { 480 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 481 int err; 482 483 GEM_BUG_ON(i915_vma_is_closed(vma)); 484 485 if (!(eb->args->flags & __EXEC_VALIDATED)) { 486 err = eb_validate_vma(eb, entry, vma); 487 if (unlikely(err)) 488 return err; 489 } 490 491 if (eb->lut_size > 0) { 492 vma->exec_handle = entry->handle; 493 hlist_add_head(&vma->exec_node, 494 &eb->buckets[hash_32(entry->handle, 495 eb->lut_size)]); 496 } 497 498 if (entry->relocation_count) 499 list_add_tail(&vma->reloc_link, &eb->relocs); 500 501 /* 502 * Stash a pointer from the vma to execobj, so we can query its flags, 503 * size, alignment etc as provided by the user. Also we stash a pointer 504 * to the vma inside the execobj so that we can use a direct lookup 505 * to find the right target VMA when doing relocations. 506 */ 507 eb->vma[i] = vma; 508 eb->flags[i] = entry->flags; 509 vma->exec_flags = &eb->flags[i]; 510 511 /* 512 * SNA is doing fancy tricks with compressing batch buffers, which leads 513 * to negative relocation deltas. Usually that works out ok since the 514 * relocate address is still positive, except when the batch is placed 515 * very low in the GTT. Ensure this doesn't happen. 516 * 517 * Note that actual hangs have only been observed on gen7, but for 518 * paranoia do it everywhere. 
519 */ 520 if (i == batch_idx) { 521 if (entry->relocation_count && 522 !(eb->flags[i] & EXEC_OBJECT_PINNED)) 523 eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; 524 if (eb->reloc_cache.has_fence) 525 eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; 526 527 eb->batch = vma; 528 } 529 530 err = 0; 531 if (eb_pin_vma(eb, entry, vma)) { 532 if (entry->offset != vma->node.start) { 533 entry->offset = vma->node.start | UPDATE; 534 eb->args->flags |= __EXEC_HAS_RELOC; 535 } 536 } else { 537 eb_unreserve_vma(vma, vma->exec_flags); 538 539 list_add_tail(&vma->exec_link, &eb->unbound); 540 if (drm_mm_node_allocated(&vma->node)) 541 err = i915_vma_unbind(vma); 542 if (unlikely(err)) 543 vma->exec_flags = NULL; 544 } 545 return err; 546 } 547 548 static inline int use_cpu_reloc(const struct reloc_cache *cache, 549 const struct drm_i915_gem_object *obj) 550 { 551 if (!i915_gem_object_has_struct_page(obj)) 552 return false; 553 554 if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 555 return true; 556 557 if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 558 return false; 559 560 return (cache->has_llc || 561 obj->cache_dirty || 562 obj->cache_level != I915_CACHE_NONE); 563 } 564 565 static int eb_reserve_vma(const struct i915_execbuffer *eb, 566 struct i915_vma *vma) 567 { 568 struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 569 unsigned int exec_flags = *vma->exec_flags; 570 u64 pin_flags; 571 int err; 572 573 pin_flags = PIN_USER | PIN_NONBLOCK; 574 if (exec_flags & EXEC_OBJECT_NEEDS_GTT) 575 pin_flags |= PIN_GLOBAL; 576 577 /* 578 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, 579 * limit address to the first 4GBs for unflagged objects. 580 */ 581 if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 582 pin_flags |= PIN_ZONE_4G; 583 584 if (exec_flags & __EXEC_OBJECT_NEEDS_MAP) 585 pin_flags |= PIN_MAPPABLE; 586 587 if (exec_flags & EXEC_OBJECT_PINNED) { 588 pin_flags |= entry->offset | PIN_OFFSET_FIXED; 589 pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */ 590 } else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) { 591 pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; 592 } 593 594 err = i915_vma_pin(vma, 595 entry->pad_to_size, entry->alignment, 596 pin_flags); 597 if (err) 598 return err; 599 600 if (entry->offset != vma->node.start) { 601 entry->offset = vma->node.start | UPDATE; 602 eb->args->flags |= __EXEC_HAS_RELOC; 603 } 604 605 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 606 err = i915_vma_pin_fence(vma); 607 if (unlikely(err)) { 608 i915_vma_unpin(vma); 609 return err; 610 } 611 612 if (vma->fence) 613 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 614 } 615 616 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 617 GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags)); 618 619 return 0; 620 } 621 622 static int eb_reserve(struct i915_execbuffer *eb) 623 { 624 const unsigned int count = eb->buffer_count; 625 struct list_head last; 626 struct i915_vma *vma; 627 unsigned int i, pass; 628 int err; 629 630 /* 631 * Attempt to pin all of the buffers into the GTT. 632 * This is done in 3 phases: 633 * 634 * 1a. Unbind all objects that do not match the GTT constraints for 635 * the execbuffer (fenceable, mappable, alignment etc). 636 * 1b. Increment pin count for already bound objects. 637 * 2. Bind new objects. 638 * 3. Decrement pin count. 639 * 640 * This avoid unnecessary unbinding of later objects in order to make 641 * room for the earlier objects *unless* we need to defragment. 
642 */ 643 644 pass = 0; 645 err = 0; 646 do { 647 list_for_each_entry(vma, &eb->unbound, exec_link) { 648 err = eb_reserve_vma(eb, vma); 649 if (err) 650 break; 651 } 652 if (err != -ENOSPC) 653 return err; 654 655 /* Resort *all* the objects into priority order */ 656 INIT_LIST_HEAD(&eb->unbound); 657 INIT_LIST_HEAD(&last); 658 for (i = 0; i < count; i++) { 659 unsigned int flags = eb->flags[i]; 660 struct i915_vma *vma = eb->vma[i]; 661 662 if (flags & EXEC_OBJECT_PINNED && 663 flags & __EXEC_OBJECT_HAS_PIN) 664 continue; 665 666 eb_unreserve_vma(vma, &eb->flags[i]); 667 668 if (flags & EXEC_OBJECT_PINNED) 669 /* Pinned must have their slot */ 670 list_add(&vma->exec_link, &eb->unbound); 671 else if (flags & __EXEC_OBJECT_NEEDS_MAP) 672 /* Map require the lowest 256MiB (aperture) */ 673 list_add_tail(&vma->exec_link, &eb->unbound); 674 else if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 675 /* Prioritise 4GiB region for restricted bo */ 676 list_add(&vma->exec_link, &last); 677 else 678 list_add_tail(&vma->exec_link, &last); 679 } 680 list_splice_tail(&last, &eb->unbound); 681 682 switch (pass++) { 683 case 0: 684 break; 685 686 case 1: 687 /* Too fragmented, unbind everything and retry */ 688 mutex_lock(&eb->context->vm->mutex); 689 err = i915_gem_evict_vm(eb->context->vm); 690 mutex_unlock(&eb->context->vm->mutex); 691 if (err) 692 return err; 693 break; 694 695 default: 696 return -ENOSPC; 697 } 698 } while (1); 699 } 700 701 static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 702 { 703 if (eb->args->flags & I915_EXEC_BATCH_FIRST) 704 return 0; 705 else 706 return eb->buffer_count - 1; 707 } 708 709 static int eb_select_context(struct i915_execbuffer *eb) 710 { 711 struct i915_gem_context *ctx; 712 713 ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1); 714 if (unlikely(!ctx)) 715 return -ENOENT; 716 717 eb->gem_context = ctx; 718 if (rcu_access_pointer(ctx->vm)) 719 eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; 720 721 eb->context_flags = 0; 722 if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) 723 eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; 724 725 return 0; 726 } 727 728 static int eb_lookup_vmas(struct i915_execbuffer *eb) 729 { 730 struct radix_tree_root *handles_vma = &eb->gem_context->handles_vma; 731 struct drm_i915_gem_object *obj; 732 unsigned int i, batch; 733 int err; 734 735 INIT_LIST_HEAD(&eb->relocs); 736 INIT_LIST_HEAD(&eb->unbound); 737 738 batch = eb_batch_index(eb); 739 740 mutex_lock(&eb->gem_context->mutex); 741 if (unlikely(i915_gem_context_is_closed(eb->gem_context))) { 742 err = -ENOENT; 743 goto err_ctx; 744 } 745 746 for (i = 0; i < eb->buffer_count; i++) { 747 u32 handle = eb->exec[i].handle; 748 struct i915_lut_handle *lut; 749 struct i915_vma *vma; 750 751 vma = radix_tree_lookup(handles_vma, handle); 752 if (likely(vma)) 753 goto add_vma; 754 755 obj = i915_gem_object_lookup(eb->file, handle); 756 if (unlikely(!obj)) { 757 err = -ENOENT; 758 goto err_vma; 759 } 760 761 vma = i915_vma_instance(obj, eb->context->vm, NULL); 762 if (IS_ERR(vma)) { 763 err = PTR_ERR(vma); 764 goto err_obj; 765 } 766 767 lut = i915_lut_handle_alloc(); 768 if (unlikely(!lut)) { 769 err = -ENOMEM; 770 goto err_obj; 771 } 772 773 err = radix_tree_insert(handles_vma, handle, vma); 774 if (unlikely(err)) { 775 i915_lut_handle_free(lut); 776 goto err_obj; 777 } 778 779 /* transfer ref to lut */ 780 if (!atomic_fetch_inc(&vma->open_count)) 781 i915_vma_reopen(vma); 782 lut->handle = handle; 783 lut->ctx = eb->gem_context; 784 785 
i915_gem_object_lock(obj); 786 list_add(&lut->obj_link, &obj->lut_list); 787 i915_gem_object_unlock(obj); 788 789 add_vma: 790 err = eb_add_vma(eb, i, batch, vma); 791 if (unlikely(err)) 792 goto err_vma; 793 794 GEM_BUG_ON(vma != eb->vma[i]); 795 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 796 GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && 797 eb_vma_misplaced(&eb->exec[i], vma, eb->flags[i])); 798 } 799 800 mutex_unlock(&eb->gem_context->mutex); 801 802 eb->args->flags |= __EXEC_VALIDATED; 803 return eb_reserve(eb); 804 805 err_obj: 806 i915_gem_object_put(obj); 807 err_vma: 808 eb->vma[i] = NULL; 809 err_ctx: 810 mutex_unlock(&eb->gem_context->mutex); 811 return err; 812 } 813 814 static struct i915_vma * 815 eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) 816 { 817 if (eb->lut_size < 0) { 818 if (handle >= -eb->lut_size) 819 return NULL; 820 return eb->vma[handle]; 821 } else { 822 struct hlist_head *head; 823 struct i915_vma *vma; 824 825 head = &eb->buckets[hash_32(handle, eb->lut_size)]; 826 hlist_for_each_entry(vma, head, exec_node) { 827 if (vma->exec_handle == handle) 828 return vma; 829 } 830 return NULL; 831 } 832 } 833 834 static void eb_release_vmas(const struct i915_execbuffer *eb) 835 { 836 const unsigned int count = eb->buffer_count; 837 unsigned int i; 838 839 for (i = 0; i < count; i++) { 840 struct i915_vma *vma = eb->vma[i]; 841 unsigned int flags = eb->flags[i]; 842 843 if (!vma) 844 break; 845 846 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 847 vma->exec_flags = NULL; 848 eb->vma[i] = NULL; 849 850 if (flags & __EXEC_OBJECT_HAS_PIN) 851 __eb_unreserve_vma(vma, flags); 852 853 if (flags & __EXEC_OBJECT_HAS_REF) 854 i915_vma_put(vma); 855 } 856 } 857 858 static void eb_reset_vmas(const struct i915_execbuffer *eb) 859 { 860 eb_release_vmas(eb); 861 if (eb->lut_size > 0) 862 memset(eb->buckets, 0, 863 sizeof(struct hlist_head) << eb->lut_size); 864 } 865 866 static void eb_destroy(const struct i915_execbuffer *eb) 867 { 868 GEM_BUG_ON(eb->reloc_cache.rq); 869 870 if (eb->lut_size > 0) 871 kfree(eb->buckets); 872 } 873 874 static inline u64 875 relocation_target(const struct drm_i915_gem_relocation_entry *reloc, 876 const struct i915_vma *target) 877 { 878 return gen8_canonical_addr((int)reloc->delta + target->node.start); 879 } 880 881 static void reloc_cache_init(struct reloc_cache *cache, 882 struct drm_i915_private *i915) 883 { 884 cache->page = -1; 885 cache->vaddr = 0; 886 /* Must be a variable in the struct to allow GCC to unroll. 
*/ 887 cache->gen = INTEL_GEN(i915); 888 cache->has_llc = HAS_LLC(i915); 889 cache->use_64bit_reloc = HAS_64BIT_RELOC(i915); 890 cache->has_fence = cache->gen < 4; 891 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 892 cache->node.flags = 0; 893 cache->rq = NULL; 894 cache->rq_size = 0; 895 } 896 897 static inline void *unmask_page(unsigned long p) 898 { 899 return (void *)(uintptr_t)(p & PAGE_MASK); 900 } 901 902 static inline unsigned int unmask_flags(unsigned long p) 903 { 904 return p & ~PAGE_MASK; 905 } 906 907 #define KMAP 0x4 /* after CLFLUSH_FLAGS */ 908 909 static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) 910 { 911 struct drm_i915_private *i915 = 912 container_of(cache, struct i915_execbuffer, reloc_cache)->i915; 913 return &i915->ggtt; 914 } 915 916 static void reloc_gpu_flush(struct reloc_cache *cache) 917 { 918 GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32)); 919 cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; 920 921 __i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size); 922 i915_gem_object_unpin_map(cache->rq->batch->obj); 923 924 intel_gt_chipset_flush(cache->rq->engine->gt); 925 926 i915_request_add(cache->rq); 927 cache->rq = NULL; 928 } 929 930 static void reloc_cache_reset(struct reloc_cache *cache) 931 { 932 void *vaddr; 933 934 if (cache->rq) 935 reloc_gpu_flush(cache); 936 937 if (!cache->vaddr) 938 return; 939 940 vaddr = unmask_page(cache->vaddr); 941 if (cache->vaddr & KMAP) { 942 if (cache->vaddr & CLFLUSH_AFTER) 943 mb(); 944 945 kunmap_atomic(vaddr); 946 i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); 947 } else { 948 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 949 950 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 951 io_mapping_unmap_atomic((void __iomem *)vaddr); 952 953 if (drm_mm_node_allocated(&cache->node)) { 954 ggtt->vm.clear_range(&ggtt->vm, 955 cache->node.start, 956 cache->node.size); 957 mutex_lock(&ggtt->vm.mutex); 958 drm_mm_remove_node(&cache->node); 959 mutex_unlock(&ggtt->vm.mutex); 960 } else { 961 i915_vma_unpin((struct i915_vma *)cache->node.mm); 962 } 963 } 964 965 cache->vaddr = 0; 966 cache->page = -1; 967 } 968 969 static void *reloc_kmap(struct drm_i915_gem_object *obj, 970 struct reloc_cache *cache, 971 unsigned long page) 972 { 973 void *vaddr; 974 975 if (cache->vaddr) { 976 kunmap_atomic(unmask_page(cache->vaddr)); 977 } else { 978 unsigned int flushes; 979 int err; 980 981 err = i915_gem_object_prepare_write(obj, &flushes); 982 if (err) 983 return ERR_PTR(err); 984 985 BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 986 BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 987 988 cache->vaddr = flushes | KMAP; 989 cache->node.mm = (void *)obj; 990 if (flushes) 991 mb(); 992 } 993 994 vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); 995 cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 996 cache->page = page; 997 998 return vaddr; 999 } 1000 1001 static void *reloc_iomap(struct drm_i915_gem_object *obj, 1002 struct reloc_cache *cache, 1003 unsigned long page) 1004 { 1005 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1006 unsigned long offset; 1007 void *vaddr; 1008 1009 if (cache->vaddr) { 1010 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1011 io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); 1012 } else { 1013 struct i915_vma *vma; 1014 int err; 1015 1016 if (i915_gem_object_is_tiled(obj)) 1017 return ERR_PTR(-EINVAL); 1018 1019 if (use_cpu_reloc(cache, obj)) 1020 return 
NULL; 1021 1022 i915_gem_object_lock(obj); 1023 err = i915_gem_object_set_to_gtt_domain(obj, true); 1024 i915_gem_object_unlock(obj); 1025 if (err) 1026 return ERR_PTR(err); 1027 1028 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 1029 PIN_MAPPABLE | 1030 PIN_NONBLOCK /* NOWARN */ | 1031 PIN_NOEVICT); 1032 if (IS_ERR(vma)) { 1033 memset(&cache->node, 0, sizeof(cache->node)); 1034 mutex_lock(&ggtt->vm.mutex); 1035 err = drm_mm_insert_node_in_range 1036 (&ggtt->vm.mm, &cache->node, 1037 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1038 0, ggtt->mappable_end, 1039 DRM_MM_INSERT_LOW); 1040 mutex_unlock(&ggtt->vm.mutex); 1041 if (err) /* no inactive aperture space, use cpu reloc */ 1042 return NULL; 1043 } else { 1044 cache->node.start = vma->node.start; 1045 cache->node.mm = (void *)vma; 1046 } 1047 } 1048 1049 offset = cache->node.start; 1050 if (drm_mm_node_allocated(&cache->node)) { 1051 ggtt->vm.insert_page(&ggtt->vm, 1052 i915_gem_object_get_dma_address(obj, page), 1053 offset, I915_CACHE_NONE, 0); 1054 } else { 1055 offset += page << PAGE_SHIFT; 1056 } 1057 1058 vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1059 offset); 1060 cache->page = page; 1061 cache->vaddr = (unsigned long)vaddr; 1062 1063 return vaddr; 1064 } 1065 1066 static void *reloc_vaddr(struct drm_i915_gem_object *obj, 1067 struct reloc_cache *cache, 1068 unsigned long page) 1069 { 1070 void *vaddr; 1071 1072 if (cache->page == page) { 1073 vaddr = unmask_page(cache->vaddr); 1074 } else { 1075 vaddr = NULL; 1076 if ((cache->vaddr & KMAP) == 0) 1077 vaddr = reloc_iomap(obj, cache, page); 1078 if (!vaddr) 1079 vaddr = reloc_kmap(obj, cache, page); 1080 } 1081 1082 return vaddr; 1083 } 1084 1085 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1086 { 1087 if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1088 if (flushes & CLFLUSH_BEFORE) { 1089 clflushopt(addr); 1090 mb(); 1091 } 1092 1093 *addr = value; 1094 1095 /* 1096 * Writes to the same cacheline are serialised by the CPU 1097 * (including clflush). On the write path, we only require 1098 * that it hits memory in an orderly fashion and place 1099 * mb barriers at the start and end of the relocation phase 1100 * to ensure ordering of clflush wrt to the system. 1101 */ 1102 if (flushes & CLFLUSH_AFTER) 1103 clflushopt(addr); 1104 } else 1105 *addr = value; 1106 } 1107 1108 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) 1109 { 1110 struct drm_i915_gem_object *obj = vma->obj; 1111 int err; 1112 1113 i915_vma_lock(vma); 1114 1115 if (obj->cache_dirty & ~obj->cache_coherent) 1116 i915_gem_clflush_object(obj, 0); 1117 obj->write_domain = 0; 1118 1119 err = i915_request_await_object(rq, vma->obj, true); 1120 if (err == 0) 1121 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 1122 1123 i915_vma_unlock(vma); 1124 1125 return err; 1126 } 1127 1128 static int __reloc_gpu_alloc(struct i915_execbuffer *eb, 1129 struct i915_vma *vma, 1130 unsigned int len) 1131 { 1132 struct reloc_cache *cache = &eb->reloc_cache; 1133 struct intel_engine_pool_node *pool; 1134 struct i915_request *rq; 1135 struct i915_vma *batch; 1136 u32 *cmd; 1137 int err; 1138 1139 pool = intel_engine_get_pool(eb->engine, PAGE_SIZE); 1140 if (IS_ERR(pool)) 1141 return PTR_ERR(pool); 1142 1143 cmd = i915_gem_object_pin_map(pool->obj, 1144 cache->has_llc ? 
1145 I915_MAP_FORCE_WB : 1146 I915_MAP_FORCE_WC); 1147 if (IS_ERR(cmd)) { 1148 err = PTR_ERR(cmd); 1149 goto out_pool; 1150 } 1151 1152 batch = i915_vma_instance(pool->obj, vma->vm, NULL); 1153 if (IS_ERR(batch)) { 1154 err = PTR_ERR(batch); 1155 goto err_unmap; 1156 } 1157 1158 err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); 1159 if (err) 1160 goto err_unmap; 1161 1162 rq = i915_request_create(eb->context); 1163 if (IS_ERR(rq)) { 1164 err = PTR_ERR(rq); 1165 goto err_unpin; 1166 } 1167 1168 err = intel_engine_pool_mark_active(pool, rq); 1169 if (err) 1170 goto err_request; 1171 1172 err = reloc_move_to_gpu(rq, vma); 1173 if (err) 1174 goto err_request; 1175 1176 err = eb->engine->emit_bb_start(rq, 1177 batch->node.start, PAGE_SIZE, 1178 cache->gen > 5 ? 0 : I915_DISPATCH_SECURE); 1179 if (err) 1180 goto skip_request; 1181 1182 i915_vma_lock(batch); 1183 err = i915_request_await_object(rq, batch->obj, false); 1184 if (err == 0) 1185 err = i915_vma_move_to_active(batch, rq, 0); 1186 i915_vma_unlock(batch); 1187 if (err) 1188 goto skip_request; 1189 1190 rq->batch = batch; 1191 i915_vma_unpin(batch); 1192 1193 cache->rq = rq; 1194 cache->rq_cmd = cmd; 1195 cache->rq_size = 0; 1196 1197 /* Return with batch mapping (cmd) still pinned */ 1198 goto out_pool; 1199 1200 skip_request: 1201 i915_request_skip(rq, err); 1202 err_request: 1203 i915_request_add(rq); 1204 err_unpin: 1205 i915_vma_unpin(batch); 1206 err_unmap: 1207 i915_gem_object_unpin_map(pool->obj); 1208 out_pool: 1209 intel_engine_pool_put(pool); 1210 return err; 1211 } 1212 1213 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1214 struct i915_vma *vma, 1215 unsigned int len) 1216 { 1217 struct reloc_cache *cache = &eb->reloc_cache; 1218 u32 *cmd; 1219 1220 if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) 1221 reloc_gpu_flush(cache); 1222 1223 if (unlikely(!cache->rq)) { 1224 int err; 1225 1226 if (!intel_engine_can_store_dword(eb->engine)) 1227 return ERR_PTR(-ENODEV); 1228 1229 err = __reloc_gpu_alloc(eb, vma, len); 1230 if (unlikely(err)) 1231 return ERR_PTR(err); 1232 } 1233 1234 cmd = cache->rq_cmd + cache->rq_size; 1235 cache->rq_size += len; 1236 1237 return cmd; 1238 } 1239 1240 static u64 1241 relocate_entry(struct i915_vma *vma, 1242 const struct drm_i915_gem_relocation_entry *reloc, 1243 struct i915_execbuffer *eb, 1244 const struct i915_vma *target) 1245 { 1246 u64 offset = reloc->offset; 1247 u64 target_offset = relocation_target(reloc, target); 1248 bool wide = eb->reloc_cache.use_64bit_reloc; 1249 void *vaddr; 1250 1251 if (!eb->reloc_cache.vaddr && 1252 (DBG_FORCE_RELOC == FORCE_GPU_RELOC || 1253 !dma_resv_test_signaled_rcu(vma->resv, true))) { 1254 const unsigned int gen = eb->reloc_cache.gen; 1255 unsigned int len; 1256 u32 *batch; 1257 u64 addr; 1258 1259 if (wide) 1260 len = offset & 7 ? 
8 : 5; 1261 else if (gen >= 4) 1262 len = 4; 1263 else 1264 len = 3; 1265 1266 batch = reloc_gpu(eb, vma, len); 1267 if (IS_ERR(batch)) 1268 goto repeat; 1269 1270 addr = gen8_canonical_addr(vma->node.start + offset); 1271 if (wide) { 1272 if (offset & 7) { 1273 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1274 *batch++ = lower_32_bits(addr); 1275 *batch++ = upper_32_bits(addr); 1276 *batch++ = lower_32_bits(target_offset); 1277 1278 addr = gen8_canonical_addr(addr + 4); 1279 1280 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1281 *batch++ = lower_32_bits(addr); 1282 *batch++ = upper_32_bits(addr); 1283 *batch++ = upper_32_bits(target_offset); 1284 } else { 1285 *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1286 *batch++ = lower_32_bits(addr); 1287 *batch++ = upper_32_bits(addr); 1288 *batch++ = lower_32_bits(target_offset); 1289 *batch++ = upper_32_bits(target_offset); 1290 } 1291 } else if (gen >= 6) { 1292 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1293 *batch++ = 0; 1294 *batch++ = addr; 1295 *batch++ = target_offset; 1296 } else if (gen >= 4) { 1297 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1298 *batch++ = 0; 1299 *batch++ = addr; 1300 *batch++ = target_offset; 1301 } else { 1302 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1303 *batch++ = addr; 1304 *batch++ = target_offset; 1305 } 1306 1307 goto out; 1308 } 1309 1310 repeat: 1311 vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT); 1312 if (IS_ERR(vaddr)) 1313 return PTR_ERR(vaddr); 1314 1315 clflush_write32(vaddr + offset_in_page(offset), 1316 lower_32_bits(target_offset), 1317 eb->reloc_cache.vaddr); 1318 1319 if (wide) { 1320 offset += sizeof(u32); 1321 target_offset >>= 32; 1322 wide = false; 1323 goto repeat; 1324 } 1325 1326 out: 1327 return target->node.start | UPDATE; 1328 } 1329 1330 static u64 1331 eb_relocate_entry(struct i915_execbuffer *eb, 1332 struct i915_vma *vma, 1333 const struct drm_i915_gem_relocation_entry *reloc) 1334 { 1335 struct drm_i915_private *i915 = eb->i915; 1336 struct i915_vma *target; 1337 int err; 1338 1339 /* we've already hold a reference to all valid objects */ 1340 target = eb_get_vma(eb, reloc->target_handle); 1341 if (unlikely(!target)) 1342 return -ENOENT; 1343 1344 /* Validate that the target is in a valid r/w GPU domain */ 1345 if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) { 1346 drm_dbg(&i915->drm, "reloc with multiple write domains: " 1347 "target %d offset %d " 1348 "read %08x write %08x", 1349 reloc->target_handle, 1350 (int) reloc->offset, 1351 reloc->read_domains, 1352 reloc->write_domain); 1353 return -EINVAL; 1354 } 1355 if (unlikely((reloc->write_domain | reloc->read_domains) 1356 & ~I915_GEM_GPU_DOMAINS)) { 1357 drm_dbg(&i915->drm, "reloc with read/write non-GPU domains: " 1358 "target %d offset %d " 1359 "read %08x write %08x", 1360 reloc->target_handle, 1361 (int) reloc->offset, 1362 reloc->read_domains, 1363 reloc->write_domain); 1364 return -EINVAL; 1365 } 1366 1367 if (reloc->write_domain) { 1368 *target->exec_flags |= EXEC_OBJECT_WRITE; 1369 1370 /* 1371 * Sandybridge PPGTT errata: We need a global gtt mapping 1372 * for MI and pipe_control writes because the gpu doesn't 1373 * properly redirect them through the ppgtt for non_secure 1374 * batchbuffers. 
1375 */ 1376 if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION && 1377 IS_GEN(eb->i915, 6)) { 1378 err = i915_vma_bind(target, target->obj->cache_level, 1379 PIN_GLOBAL, NULL); 1380 if (WARN_ONCE(err, 1381 "Unexpected failure to bind target VMA!")) 1382 return err; 1383 } 1384 } 1385 1386 /* 1387 * If the relocation already has the right value in it, no 1388 * more work needs to be done. 1389 */ 1390 if (!DBG_FORCE_RELOC && 1391 gen8_canonical_addr(target->node.start) == reloc->presumed_offset) 1392 return 0; 1393 1394 /* Check that the relocation address is valid... */ 1395 if (unlikely(reloc->offset > 1396 vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) { 1397 drm_dbg(&i915->drm, "Relocation beyond object bounds: " 1398 "target %d offset %d size %d.\n", 1399 reloc->target_handle, 1400 (int)reloc->offset, 1401 (int)vma->size); 1402 return -EINVAL; 1403 } 1404 if (unlikely(reloc->offset & 3)) { 1405 drm_dbg(&i915->drm, "Relocation not 4-byte aligned: " 1406 "target %d offset %d.\n", 1407 reloc->target_handle, 1408 (int)reloc->offset); 1409 return -EINVAL; 1410 } 1411 1412 /* 1413 * If we write into the object, we need to force the synchronisation 1414 * barrier, either with an asynchronous clflush or if we executed the 1415 * patching using the GPU (though that should be serialised by the 1416 * timeline). To be completely sure, and since we are required to 1417 * do relocations we are already stalling, disable the user's opt 1418 * out of our synchronisation. 1419 */ 1420 *vma->exec_flags &= ~EXEC_OBJECT_ASYNC; 1421 1422 /* and update the user's relocation entry */ 1423 return relocate_entry(vma, reloc, eb, target); 1424 } 1425 1426 static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma) 1427 { 1428 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry)) 1429 struct drm_i915_gem_relocation_entry stack[N_RELOC(512)]; 1430 struct drm_i915_gem_relocation_entry __user *urelocs; 1431 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1432 unsigned int remain; 1433 1434 urelocs = u64_to_user_ptr(entry->relocs_ptr); 1435 remain = entry->relocation_count; 1436 if (unlikely(remain > N_RELOC(ULONG_MAX))) 1437 return -EINVAL; 1438 1439 /* 1440 * We must check that the entire relocation array is safe 1441 * to read. However, if the array is not writable the user loses 1442 * the updated relocation values. 1443 */ 1444 if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs)))) 1445 return -EFAULT; 1446 1447 do { 1448 struct drm_i915_gem_relocation_entry *r = stack; 1449 unsigned int count = 1450 min_t(unsigned int, remain, ARRAY_SIZE(stack)); 1451 unsigned int copied; 1452 1453 /* 1454 * This is the fast path and we cannot handle a pagefault 1455 * whilst holding the struct mutex lest the user pass in the 1456 * relocations contained within a mmaped bo. For in such a case 1457 * we, the page fault handler would call i915_gem_fault() and 1458 * we would try to acquire the struct mutex again. Obviously 1459 * this is bad and so lockdep complains vehemently. 
1460 */ 1461 pagefault_disable(); 1462 copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); 1463 pagefault_enable(); 1464 if (unlikely(copied)) { 1465 remain = -EFAULT; 1466 goto out; 1467 } 1468 1469 remain -= count; 1470 do { 1471 u64 offset = eb_relocate_entry(eb, vma, r); 1472 1473 if (likely(offset == 0)) { 1474 } else if ((s64)offset < 0) { 1475 remain = (int)offset; 1476 goto out; 1477 } else { 1478 /* 1479 * Note that reporting an error now 1480 * leaves everything in an inconsistent 1481 * state as we have *already* changed 1482 * the relocation value inside the 1483 * object. As we have not changed the 1484 * reloc.presumed_offset or will not 1485 * change the execobject.offset, on the 1486 * call we may not rewrite the value 1487 * inside the object, leaving it 1488 * dangling and causing a GPU hang. Unless 1489 * userspace dynamically rebuilds the 1490 * relocations on each execbuf rather than 1491 * presume a static tree. 1492 * 1493 * We did previously check if the relocations 1494 * were writable (access_ok), an error now 1495 * would be a strange race with mprotect, 1496 * having already demonstrated that we 1497 * can read from this userspace address. 1498 */ 1499 offset = gen8_canonical_addr(offset & ~UPDATE); 1500 if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) { 1501 remain = -EFAULT; 1502 goto out; 1503 } 1504 } 1505 } while (r++, --count); 1506 urelocs += ARRAY_SIZE(stack); 1507 } while (remain); 1508 out: 1509 reloc_cache_reset(&eb->reloc_cache); 1510 return remain; 1511 } 1512 1513 static int 1514 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma) 1515 { 1516 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1517 struct drm_i915_gem_relocation_entry *relocs = 1518 u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1519 unsigned int i; 1520 int err; 1521 1522 for (i = 0; i < entry->relocation_count; i++) { 1523 u64 offset = eb_relocate_entry(eb, vma, &relocs[i]); 1524 1525 if ((s64)offset < 0) { 1526 err = (int)offset; 1527 goto err; 1528 } 1529 } 1530 err = 0; 1531 err: 1532 reloc_cache_reset(&eb->reloc_cache); 1533 return err; 1534 } 1535 1536 static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) 1537 { 1538 const char __user *addr, *end; 1539 unsigned long size; 1540 char __maybe_unused c; 1541 1542 size = entry->relocation_count; 1543 if (size == 0) 1544 return 0; 1545 1546 if (size > N_RELOC(ULONG_MAX)) 1547 return -EINVAL; 1548 1549 addr = u64_to_user_ptr(entry->relocs_ptr); 1550 size *= sizeof(struct drm_i915_gem_relocation_entry); 1551 if (!access_ok(addr, size)) 1552 return -EFAULT; 1553 1554 end = addr + size; 1555 for (; addr < end; addr += PAGE_SIZE) { 1556 int err = __get_user(c, addr); 1557 if (err) 1558 return err; 1559 } 1560 return __get_user(c, end - 1); 1561 } 1562 1563 static int eb_copy_relocations(const struct i915_execbuffer *eb) 1564 { 1565 struct drm_i915_gem_relocation_entry *relocs; 1566 const unsigned int count = eb->buffer_count; 1567 unsigned int i; 1568 int err; 1569 1570 for (i = 0; i < count; i++) { 1571 const unsigned int nreloc = eb->exec[i].relocation_count; 1572 struct drm_i915_gem_relocation_entry __user *urelocs; 1573 unsigned long size; 1574 unsigned long copied; 1575 1576 if (nreloc == 0) 1577 continue; 1578 1579 err = check_relocations(&eb->exec[i]); 1580 if (err) 1581 goto err; 1582 1583 urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); 1584 size = nreloc * sizeof(*relocs); 1585 1586 relocs = kvmalloc_array(size, 1, GFP_KERNEL); 
1587 if (!relocs) { 1588 err = -ENOMEM; 1589 goto err; 1590 } 1591 1592 /* copy_from_user is limited to < 4GiB */ 1593 copied = 0; 1594 do { 1595 unsigned int len = 1596 min_t(u64, BIT_ULL(31), size - copied); 1597 1598 if (__copy_from_user((char *)relocs + copied, 1599 (char __user *)urelocs + copied, 1600 len)) 1601 goto end; 1602 1603 copied += len; 1604 } while (copied < size); 1605 1606 /* 1607 * As we do not update the known relocation offsets after 1608 * relocating (due to the complexities in lock handling), 1609 * we need to mark them as invalid now so that we force the 1610 * relocation processing next time. Just in case the target 1611 * object is evicted and then rebound into its old 1612 * presumed_offset before the next execbuffer - if that 1613 * happened we would make the mistake of assuming that the 1614 * relocations were valid. 1615 */ 1616 if (!user_access_begin(urelocs, size)) 1617 goto end; 1618 1619 for (copied = 0; copied < nreloc; copied++) 1620 unsafe_put_user(-1, 1621 &urelocs[copied].presumed_offset, 1622 end_user); 1623 user_access_end(); 1624 1625 eb->exec[i].relocs_ptr = (uintptr_t)relocs; 1626 } 1627 1628 return 0; 1629 1630 end_user: 1631 user_access_end(); 1632 end: 1633 kvfree(relocs); 1634 err = -EFAULT; 1635 err: 1636 while (i--) { 1637 relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); 1638 if (eb->exec[i].relocation_count) 1639 kvfree(relocs); 1640 } 1641 return err; 1642 } 1643 1644 static int eb_prefault_relocations(const struct i915_execbuffer *eb) 1645 { 1646 const unsigned int count = eb->buffer_count; 1647 unsigned int i; 1648 1649 for (i = 0; i < count; i++) { 1650 int err; 1651 1652 err = check_relocations(&eb->exec[i]); 1653 if (err) 1654 return err; 1655 } 1656 1657 return 0; 1658 } 1659 1660 static noinline int eb_relocate_slow(struct i915_execbuffer *eb) 1661 { 1662 struct drm_device *dev = &eb->i915->drm; 1663 bool have_copy = false; 1664 struct i915_vma *vma; 1665 int err = 0; 1666 1667 repeat: 1668 if (signal_pending(current)) { 1669 err = -ERESTARTSYS; 1670 goto out; 1671 } 1672 1673 /* We may process another execbuffer during the unlock... */ 1674 eb_reset_vmas(eb); 1675 mutex_unlock(&dev->struct_mutex); 1676 1677 /* 1678 * We take 3 passes through the slowpatch. 1679 * 1680 * 1 - we try to just prefault all the user relocation entries and 1681 * then attempt to reuse the atomic pagefault disabled fast path again. 1682 * 1683 * 2 - we copy the user entries to a local buffer here outside of the 1684 * local and allow ourselves to wait upon any rendering before 1685 * relocations 1686 * 1687 * 3 - we already have a local copy of the relocation entries, but 1688 * were interrupted (EAGAIN) whilst waiting for the objects, try again. 
1689 */ 1690 if (!err) { 1691 err = eb_prefault_relocations(eb); 1692 } else if (!have_copy) { 1693 err = eb_copy_relocations(eb); 1694 have_copy = err == 0; 1695 } else { 1696 cond_resched(); 1697 err = 0; 1698 } 1699 if (err) { 1700 mutex_lock(&dev->struct_mutex); 1701 goto out; 1702 } 1703 1704 /* A frequent cause for EAGAIN are currently unavailable client pages */ 1705 flush_workqueue(eb->i915->mm.userptr_wq); 1706 1707 err = i915_mutex_lock_interruptible(dev); 1708 if (err) { 1709 mutex_lock(&dev->struct_mutex); 1710 goto out; 1711 } 1712 1713 /* reacquire the objects */ 1714 err = eb_lookup_vmas(eb); 1715 if (err) 1716 goto err; 1717 1718 GEM_BUG_ON(!eb->batch); 1719 1720 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1721 if (!have_copy) { 1722 pagefault_disable(); 1723 err = eb_relocate_vma(eb, vma); 1724 pagefault_enable(); 1725 if (err) 1726 goto repeat; 1727 } else { 1728 err = eb_relocate_vma_slow(eb, vma); 1729 if (err) 1730 goto err; 1731 } 1732 } 1733 1734 /* 1735 * Leave the user relocations as are, this is the painfully slow path, 1736 * and we want to avoid the complication of dropping the lock whilst 1737 * having buffers reserved in the aperture and so causing spurious 1738 * ENOSPC for random operations. 1739 */ 1740 1741 err: 1742 if (err == -EAGAIN) 1743 goto repeat; 1744 1745 out: 1746 if (have_copy) { 1747 const unsigned int count = eb->buffer_count; 1748 unsigned int i; 1749 1750 for (i = 0; i < count; i++) { 1751 const struct drm_i915_gem_exec_object2 *entry = 1752 &eb->exec[i]; 1753 struct drm_i915_gem_relocation_entry *relocs; 1754 1755 if (!entry->relocation_count) 1756 continue; 1757 1758 relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1759 kvfree(relocs); 1760 } 1761 } 1762 1763 return err; 1764 } 1765 1766 static int eb_relocate(struct i915_execbuffer *eb) 1767 { 1768 if (eb_lookup_vmas(eb)) 1769 goto slow; 1770 1771 /* The objects are in their final locations, apply the relocations. 
*/ 1772 if (eb->args->flags & __EXEC_HAS_RELOC) { 1773 struct i915_vma *vma; 1774 1775 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1776 if (eb_relocate_vma(eb, vma)) 1777 goto slow; 1778 } 1779 } 1780 1781 return 0; 1782 1783 slow: 1784 return eb_relocate_slow(eb); 1785 } 1786 1787 static int eb_move_to_gpu(struct i915_execbuffer *eb) 1788 { 1789 const unsigned int count = eb->buffer_count; 1790 struct ww_acquire_ctx acquire; 1791 unsigned int i; 1792 int err = 0; 1793 1794 ww_acquire_init(&acquire, &reservation_ww_class); 1795 1796 for (i = 0; i < count; i++) { 1797 struct i915_vma *vma = eb->vma[i]; 1798 1799 err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); 1800 if (!err) 1801 continue; 1802 1803 GEM_BUG_ON(err == -EALREADY); /* No duplicate vma */ 1804 1805 if (err == -EDEADLK) { 1806 GEM_BUG_ON(i == 0); 1807 do { 1808 int j = i - 1; 1809 1810 ww_mutex_unlock(&eb->vma[j]->resv->lock); 1811 1812 swap(eb->flags[i], eb->flags[j]); 1813 swap(eb->vma[i], eb->vma[j]); 1814 eb->vma[i]->exec_flags = &eb->flags[i]; 1815 } while (--i); 1816 GEM_BUG_ON(vma != eb->vma[0]); 1817 vma->exec_flags = &eb->flags[0]; 1818 1819 err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, 1820 &acquire); 1821 } 1822 if (err) 1823 break; 1824 } 1825 ww_acquire_done(&acquire); 1826 1827 while (i--) { 1828 unsigned int flags = eb->flags[i]; 1829 struct i915_vma *vma = eb->vma[i]; 1830 struct drm_i915_gem_object *obj = vma->obj; 1831 1832 assert_vma_held(vma); 1833 1834 if (flags & EXEC_OBJECT_CAPTURE) { 1835 struct i915_capture_list *capture; 1836 1837 capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1838 if (capture) { 1839 capture->next = eb->request->capture_list; 1840 capture->vma = vma; 1841 eb->request->capture_list = capture; 1842 } 1843 } 1844 1845 /* 1846 * If the GPU is not _reading_ through the CPU cache, we need 1847 * to make sure that any writes (both previous GPU writes from 1848 * before a change in snooping levels and normal CPU writes) 1849 * caught in that cache are flushed to main memory. 1850 * 1851 * We want to say 1852 * obj->cache_dirty && 1853 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 1854 * but gcc's optimiser doesn't handle that as well and emits 1855 * two jumps instead of one. Maybe one day... 1856 */ 1857 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 1858 if (i915_gem_clflush_object(obj, 0)) 1859 flags &= ~EXEC_OBJECT_ASYNC; 1860 } 1861 1862 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 1863 err = i915_request_await_object 1864 (eb->request, obj, flags & EXEC_OBJECT_WRITE); 1865 } 1866 1867 if (err == 0) 1868 err = i915_vma_move_to_active(vma, eb->request, flags); 1869 1870 i915_vma_unlock(vma); 1871 1872 __eb_unreserve_vma(vma, flags); 1873 vma->exec_flags = NULL; 1874 1875 if (unlikely(flags & __EXEC_OBJECT_HAS_REF)) 1876 i915_vma_put(vma); 1877 } 1878 ww_acquire_fini(&acquire); 1879 1880 if (unlikely(err)) 1881 goto err_skip; 1882 1883 eb->exec = NULL; 1884 1885 /* Unconditionally flush any chipset caches (for streaming writes). 
*/ 1886 intel_gt_chipset_flush(eb->engine->gt); 1887 return 0; 1888 1889 err_skip: 1890 i915_request_skip(eb->request, err); 1891 return err; 1892 } 1893 1894 static int i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) 1895 { 1896 if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) 1897 return -EINVAL; 1898 1899 /* Kernel clipping was a DRI1 misfeature */ 1900 if (!(exec->flags & I915_EXEC_FENCE_ARRAY)) { 1901 if (exec->num_cliprects || exec->cliprects_ptr) 1902 return -EINVAL; 1903 } 1904 1905 if (exec->DR4 == 0xffffffff) { 1906 DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); 1907 exec->DR4 = 0; 1908 } 1909 if (exec->DR1 || exec->DR4) 1910 return -EINVAL; 1911 1912 if ((exec->batch_start_offset | exec->batch_len) & 0x7) 1913 return -EINVAL; 1914 1915 return 0; 1916 } 1917 1918 static int i915_reset_gen7_sol_offsets(struct i915_request *rq) 1919 { 1920 u32 *cs; 1921 int i; 1922 1923 if (!IS_GEN(rq->i915, 7) || rq->engine->id != RCS0) { 1924 drm_dbg(&rq->i915->drm, "sol reset is gen7/rcs only\n"); 1925 return -EINVAL; 1926 } 1927 1928 cs = intel_ring_begin(rq, 4 * 2 + 2); 1929 if (IS_ERR(cs)) 1930 return PTR_ERR(cs); 1931 1932 *cs++ = MI_LOAD_REGISTER_IMM(4); 1933 for (i = 0; i < 4; i++) { 1934 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i)); 1935 *cs++ = 0; 1936 } 1937 *cs++ = MI_NOOP; 1938 intel_ring_advance(rq, cs); 1939 1940 return 0; 1941 } 1942 1943 static struct i915_vma * 1944 shadow_batch_pin(struct drm_i915_gem_object *obj, 1945 struct i915_address_space *vm, 1946 unsigned int flags) 1947 { 1948 struct i915_vma *vma; 1949 int err; 1950 1951 vma = i915_vma_instance(obj, vm, NULL); 1952 if (IS_ERR(vma)) 1953 return vma; 1954 1955 err = i915_vma_pin(vma, 0, 0, flags); 1956 if (err) 1957 return ERR_PTR(err); 1958 1959 return vma; 1960 } 1961 1962 struct eb_parse_work { 1963 struct dma_fence_work base; 1964 struct intel_engine_cs *engine; 1965 struct i915_vma *batch; 1966 struct i915_vma *shadow; 1967 struct i915_vma *trampoline; 1968 unsigned int batch_offset; 1969 unsigned int batch_length; 1970 }; 1971 1972 static int __eb_parse(struct dma_fence_work *work) 1973 { 1974 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 1975 1976 return intel_engine_cmd_parser(pw->engine, 1977 pw->batch, 1978 pw->batch_offset, 1979 pw->batch_length, 1980 pw->shadow, 1981 pw->trampoline); 1982 } 1983 1984 static void __eb_parse_release(struct dma_fence_work *work) 1985 { 1986 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 1987 1988 if (pw->trampoline) 1989 i915_active_release(&pw->trampoline->active); 1990 i915_active_release(&pw->shadow->active); 1991 i915_active_release(&pw->batch->active); 1992 } 1993 1994 static const struct dma_fence_work_ops eb_parse_ops = { 1995 .name = "eb_parse", 1996 .work = __eb_parse, 1997 .release = __eb_parse_release, 1998 }; 1999 2000 static int eb_parse_pipeline(struct i915_execbuffer *eb, 2001 struct i915_vma *shadow, 2002 struct i915_vma *trampoline) 2003 { 2004 struct eb_parse_work *pw; 2005 int err; 2006 2007 pw = kzalloc(sizeof(*pw), GFP_KERNEL); 2008 if (!pw) 2009 return -ENOMEM; 2010 2011 err = i915_active_acquire(&eb->batch->active); 2012 if (err) 2013 goto err_free; 2014 2015 err = i915_active_acquire(&shadow->active); 2016 if (err) 2017 goto err_batch; 2018 2019 if (trampoline) { 2020 err = i915_active_acquire(&trampoline->active); 2021 if (err) 2022 goto err_shadow; 2023 } 2024 2025 dma_fence_work_init(&pw->base, &eb_parse_ops); 2026 2027 pw->engine = eb->engine; 2028 pw->batch = eb->batch; 2029 
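	/*
	 * Snapshot everything the asynchronous parser will need: the engine,
	 * the user batch window (offset/length below) and the shadow (plus
	 * optional trampoline) it writes into. The fences arranged further
	 * down keep the user batch unwritten while it is scanned and make
	 * execution of the shadow wait for the scan to finish.
	 */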
pw->batch_offset = eb->batch_start_offset; 2030 pw->batch_length = eb->batch_len; 2031 pw->shadow = shadow; 2032 pw->trampoline = trampoline; 2033 2034 err = dma_resv_lock_interruptible(pw->batch->resv, NULL); 2035 if (err) 2036 goto err_trampoline; 2037 2038 err = dma_resv_reserve_shared(pw->batch->resv, 1); 2039 if (err) 2040 goto err_batch_unlock; 2041 2042 /* Wait for all writes (and relocs) into the batch to complete */ 2043 err = i915_sw_fence_await_reservation(&pw->base.chain, 2044 pw->batch->resv, NULL, false, 2045 0, I915_FENCE_GFP); 2046 if (err < 0) 2047 goto err_batch_unlock; 2048 2049 /* Keep the batch alive and unwritten as we parse */ 2050 dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma); 2051 2052 dma_resv_unlock(pw->batch->resv); 2053 2054 /* Force execution to wait for completion of the parser */ 2055 dma_resv_lock(shadow->resv, NULL); 2056 dma_resv_add_excl_fence(shadow->resv, &pw->base.dma); 2057 dma_resv_unlock(shadow->resv); 2058 2059 dma_fence_work_commit(&pw->base); 2060 return 0; 2061 2062 err_batch_unlock: 2063 dma_resv_unlock(pw->batch->resv); 2064 err_trampoline: 2065 if (trampoline) 2066 i915_active_release(&trampoline->active); 2067 err_shadow: 2068 i915_active_release(&shadow->active); 2069 err_batch: 2070 i915_active_release(&eb->batch->active); 2071 err_free: 2072 kfree(pw); 2073 return err; 2074 } 2075 2076 static int eb_parse(struct i915_execbuffer *eb) 2077 { 2078 struct drm_i915_private *i915 = eb->i915; 2079 struct intel_engine_pool_node *pool; 2080 struct i915_vma *shadow, *trampoline; 2081 unsigned int len; 2082 int err; 2083 2084 if (!eb_use_cmdparser(eb)) 2085 return 0; 2086 2087 len = eb->batch_len; 2088 if (!CMDPARSER_USES_GGTT(eb->i915)) { 2089 /* 2090 * ppGTT backed shadow buffers must be mapped RO, to prevent 2091 * post-scan tampering 2092 */ 2093 if (!eb->context->vm->has_read_only) { 2094 drm_dbg(&i915->drm, 2095 "Cannot prevent post-scan tampering without RO capable vm\n"); 2096 return -EINVAL; 2097 } 2098 } else { 2099 len += I915_CMD_PARSER_TRAMPOLINE_SIZE; 2100 } 2101 2102 pool = intel_engine_get_pool(eb->engine, len); 2103 if (IS_ERR(pool)) 2104 return PTR_ERR(pool); 2105 2106 shadow = shadow_batch_pin(pool->obj, eb->context->vm, PIN_USER); 2107 if (IS_ERR(shadow)) { 2108 err = PTR_ERR(shadow); 2109 goto err; 2110 } 2111 i915_gem_object_set_readonly(shadow->obj); 2112 2113 trampoline = NULL; 2114 if (CMDPARSER_USES_GGTT(eb->i915)) { 2115 trampoline = shadow; 2116 2117 shadow = shadow_batch_pin(pool->obj, 2118 &eb->engine->gt->ggtt->vm, 2119 PIN_GLOBAL); 2120 if (IS_ERR(shadow)) { 2121 err = PTR_ERR(shadow); 2122 shadow = trampoline; 2123 goto err_shadow; 2124 } 2125 2126 eb->batch_flags |= I915_DISPATCH_SECURE; 2127 } 2128 2129 err = eb_parse_pipeline(eb, shadow, trampoline); 2130 if (err) 2131 goto err_trampoline; 2132 2133 eb->vma[eb->buffer_count] = i915_vma_get(shadow); 2134 eb->flags[eb->buffer_count] = 2135 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; 2136 shadow->exec_flags = &eb->flags[eb->buffer_count]; 2137 eb->buffer_count++; 2138 2139 eb->trampoline = trampoline; 2140 eb->batch_start_offset = 0; 2141 eb->batch = shadow; 2142 2143 shadow->private = pool; 2144 return 0; 2145 2146 err_trampoline: 2147 if (trampoline) 2148 i915_vma_unpin(trampoline); 2149 err_shadow: 2150 i915_vma_unpin(shadow); 2151 err: 2152 intel_engine_pool_put(pool); 2153 return err; 2154 } 2155 2156 static void 2157 add_to_client(struct i915_request *rq, struct drm_file *file) 2158 { 2159 struct drm_i915_file_private *file_priv = 
file->driver_priv; 2160 2161 rq->file_priv = file_priv; 2162 2163 spin_lock(&file_priv->mm.lock); 2164 list_add_tail(&rq->client_link, &file_priv->mm.request_list); 2165 spin_unlock(&file_priv->mm.lock); 2166 } 2167 2168 static int eb_submit(struct i915_execbuffer *eb) 2169 { 2170 int err; 2171 2172 err = eb_move_to_gpu(eb); 2173 if (err) 2174 return err; 2175 2176 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2177 err = i915_reset_gen7_sol_offsets(eb->request); 2178 if (err) 2179 return err; 2180 } 2181 2182 /* 2183 * After we completed waiting for other engines (using HW semaphores) 2184 * then we can signal that this request/batch is ready to run. This 2185 * allows us to determine if the batch is still waiting on the GPU 2186 * or actually running by checking the breadcrumb. 2187 */ 2188 if (eb->engine->emit_init_breadcrumb) { 2189 err = eb->engine->emit_init_breadcrumb(eb->request); 2190 if (err) 2191 return err; 2192 } 2193 2194 err = eb->engine->emit_bb_start(eb->request, 2195 eb->batch->node.start + 2196 eb->batch_start_offset, 2197 eb->batch_len, 2198 eb->batch_flags); 2199 if (err) 2200 return err; 2201 2202 if (eb->trampoline) { 2203 GEM_BUG_ON(eb->batch_start_offset); 2204 err = eb->engine->emit_bb_start(eb->request, 2205 eb->trampoline->node.start + 2206 eb->batch_len, 2207 0, 0); 2208 if (err) 2209 return err; 2210 } 2211 2212 if (intel_context_nopreempt(eb->context)) 2213 __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); 2214 2215 return 0; 2216 } 2217 2218 static int num_vcs_engines(const struct drm_i915_private *i915) 2219 { 2220 return hweight64(INTEL_INFO(i915)->engine_mask & 2221 GENMASK_ULL(VCS0 + I915_MAX_VCS - 1, VCS0)); 2222 } 2223 2224 /* 2225 * Find one BSD ring to dispatch the corresponding BSD command. 2226 * The engine index is returned. 2227 */ 2228 static unsigned int 2229 gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, 2230 struct drm_file *file) 2231 { 2232 struct drm_i915_file_private *file_priv = file->driver_priv; 2233 2234 /* Check whether the file_priv has already selected one ring. */ 2235 if ((int)file_priv->bsd_engine < 0) 2236 file_priv->bsd_engine = 2237 get_random_int() % num_vcs_engines(dev_priv); 2238 2239 return file_priv->bsd_engine; 2240 } 2241 2242 static const enum intel_engine_id user_ring_map[] = { 2243 [I915_EXEC_DEFAULT] = RCS0, 2244 [I915_EXEC_RENDER] = RCS0, 2245 [I915_EXEC_BLT] = BCS0, 2246 [I915_EXEC_BSD] = VCS0, 2247 [I915_EXEC_VEBOX] = VECS0 2248 }; 2249 2250 static struct i915_request *eb_throttle(struct intel_context *ce) 2251 { 2252 struct intel_ring *ring = ce->ring; 2253 struct intel_timeline *tl = ce->timeline; 2254 struct i915_request *rq; 2255 2256 /* 2257 * Completely unscientific finger-in-the-air estimates for suitable 2258 * maximum user request size (to avoid blocking) and then backoff. 2259 */ 2260 if (intel_ring_update_space(ring) >= PAGE_SIZE) 2261 return NULL; 2262 2263 /* 2264 * Find a request that after waiting upon, there will be at least half 2265 * the ring available. The hysteresis allows us to compete for the 2266 * shared ring and should mean that we sleep less often prior to 2267 * claiming our resources, but not so long that the ring completely 2268 * drains before we can submit our next request. 
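	 *
	 * Purely as an illustration (ring sizes vary per engine/platform):
	 * with a 16 KiB ring we would pick the oldest request for which
	 * __intel_ring_space(rq->postfix, ring->emit, ring->size) exceeds
	 * 16384 / 2, i.e. whose completion frees more than half the ring,
	 * and wait on that one request instead of letting the ring drain.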
2269 */ 2270 list_for_each_entry(rq, &tl->requests, link) { 2271 if (rq->ring != ring) 2272 continue; 2273 2274 if (__intel_ring_space(rq->postfix, 2275 ring->emit, ring->size) > ring->size / 2) 2276 break; 2277 } 2278 if (&rq->link == &tl->requests) 2279 return NULL; /* weird, we will check again later for real */ 2280 2281 return i915_request_get(rq); 2282 } 2283 2284 static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) 2285 { 2286 struct intel_timeline *tl; 2287 struct i915_request *rq; 2288 int err; 2289 2290 /* 2291 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report 2292 * EIO if the GPU is already wedged. 2293 */ 2294 err = intel_gt_terminally_wedged(ce->engine->gt); 2295 if (err) 2296 return err; 2297 2298 if (unlikely(intel_context_is_banned(ce))) 2299 return -EIO; 2300 2301 /* 2302 * Pinning the contexts may generate requests in order to acquire 2303 * GGTT space, so do this first before we reserve a seqno for 2304 * ourselves. 2305 */ 2306 err = intel_context_pin(ce); 2307 if (err) 2308 return err; 2309 2310 /* 2311 * Take a local wakeref for preparing to dispatch the execbuf as 2312 * we expect to access the hardware fairly frequently in the 2313 * process, and require the engine to be kept awake between accesses. 2314 * Upon dispatch, we acquire another prolonged wakeref that we hold 2315 * until the timeline is idle, which in turn releases the wakeref 2316 * taken on the engine, and the parent device. 2317 */ 2318 tl = intel_context_timeline_lock(ce); 2319 if (IS_ERR(tl)) { 2320 err = PTR_ERR(tl); 2321 goto err_unpin; 2322 } 2323 2324 intel_context_enter(ce); 2325 rq = eb_throttle(ce); 2326 2327 intel_context_timeline_unlock(tl); 2328 2329 if (rq) { 2330 if (i915_request_wait(rq, 2331 I915_WAIT_INTERRUPTIBLE, 2332 MAX_SCHEDULE_TIMEOUT) < 0) { 2333 i915_request_put(rq); 2334 err = -EINTR; 2335 goto err_exit; 2336 } 2337 2338 i915_request_put(rq); 2339 } 2340 2341 eb->engine = ce->engine; 2342 eb->context = ce; 2343 return 0; 2344 2345 err_exit: 2346 mutex_lock(&tl->mutex); 2347 intel_context_exit(ce); 2348 intel_context_timeline_unlock(tl); 2349 err_unpin: 2350 intel_context_unpin(ce); 2351 return err; 2352 } 2353 2354 static void eb_unpin_engine(struct i915_execbuffer *eb) 2355 { 2356 struct intel_context *ce = eb->context; 2357 struct intel_timeline *tl = ce->timeline; 2358 2359 mutex_lock(&tl->mutex); 2360 intel_context_exit(ce); 2361 mutex_unlock(&tl->mutex); 2362 2363 intel_context_unpin(ce); 2364 } 2365 2366 static unsigned int 2367 eb_select_legacy_ring(struct i915_execbuffer *eb, 2368 struct drm_file *file, 2369 struct drm_i915_gem_execbuffer2 *args) 2370 { 2371 struct drm_i915_private *i915 = eb->i915; 2372 unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; 2373 2374 if (user_ring_id != I915_EXEC_BSD && 2375 (args->flags & I915_EXEC_BSD_MASK)) { 2376 drm_dbg(&i915->drm, 2377 "execbuf with non bsd ring but with invalid " 2378 "bsd dispatch flags: %d\n", (int)(args->flags)); 2379 return -1; 2380 } 2381 2382 if (user_ring_id == I915_EXEC_BSD && num_vcs_engines(i915) > 1) { 2383 unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; 2384 2385 if (bsd_idx == I915_EXEC_BSD_DEFAULT) { 2386 bsd_idx = gen8_dispatch_bsd_engine(i915, file); 2387 } else if (bsd_idx >= I915_EXEC_BSD_RING1 && 2388 bsd_idx <= I915_EXEC_BSD_RING2) { 2389 bsd_idx >>= I915_EXEC_BSD_SHIFT; 2390 bsd_idx--; 2391 } else { 2392 drm_dbg(&i915->drm, 2393 "execbuf with unknown bsd ring: %u\n", 2394 bsd_idx); 2395 return -1; 2396 } 2397 2398 return 
_VCS(bsd_idx); 2399 } 2400 2401 if (user_ring_id >= ARRAY_SIZE(user_ring_map)) { 2402 drm_dbg(&i915->drm, "execbuf with unknown ring: %u\n", 2403 user_ring_id); 2404 return -1; 2405 } 2406 2407 return user_ring_map[user_ring_id]; 2408 } 2409 2410 static int 2411 eb_pin_engine(struct i915_execbuffer *eb, 2412 struct drm_file *file, 2413 struct drm_i915_gem_execbuffer2 *args) 2414 { 2415 struct intel_context *ce; 2416 unsigned int idx; 2417 int err; 2418 2419 if (i915_gem_context_user_engines(eb->gem_context)) 2420 idx = args->flags & I915_EXEC_RING_MASK; 2421 else 2422 idx = eb_select_legacy_ring(eb, file, args); 2423 2424 ce = i915_gem_context_get_engine(eb->gem_context, idx); 2425 if (IS_ERR(ce)) 2426 return PTR_ERR(ce); 2427 2428 err = __eb_pin_engine(eb, ce); 2429 intel_context_put(ce); 2430 2431 return err; 2432 } 2433 2434 static void 2435 __free_fence_array(struct drm_syncobj **fences, unsigned int n) 2436 { 2437 while (n--) 2438 drm_syncobj_put(ptr_mask_bits(fences[n], 2)); 2439 kvfree(fences); 2440 } 2441 2442 static struct drm_syncobj ** 2443 get_fence_array(struct drm_i915_gem_execbuffer2 *args, 2444 struct drm_file *file) 2445 { 2446 const unsigned long nfences = args->num_cliprects; 2447 struct drm_i915_gem_exec_fence __user *user; 2448 struct drm_syncobj **fences; 2449 unsigned long n; 2450 int err; 2451 2452 if (!(args->flags & I915_EXEC_FENCE_ARRAY)) 2453 return NULL; 2454 2455 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 2456 BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long)); 2457 if (nfences > min_t(unsigned long, 2458 ULONG_MAX / sizeof(*user), 2459 SIZE_MAX / sizeof(*fences))) 2460 return ERR_PTR(-EINVAL); 2461 2462 user = u64_to_user_ptr(args->cliprects_ptr); 2463 if (!access_ok(user, nfences * sizeof(*user))) 2464 return ERR_PTR(-EFAULT); 2465 2466 fences = kvmalloc_array(nfences, sizeof(*fences), 2467 __GFP_NOWARN | GFP_KERNEL); 2468 if (!fences) 2469 return ERR_PTR(-ENOMEM); 2470 2471 for (n = 0; n < nfences; n++) { 2472 struct drm_i915_gem_exec_fence fence; 2473 struct drm_syncobj *syncobj; 2474 2475 if (__copy_from_user(&fence, user++, sizeof(fence))) { 2476 err = -EFAULT; 2477 goto err; 2478 } 2479 2480 if (fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) { 2481 err = -EINVAL; 2482 goto err; 2483 } 2484 2485 syncobj = drm_syncobj_find(file, fence.handle); 2486 if (!syncobj) { 2487 DRM_DEBUG("Invalid syncobj handle provided\n"); 2488 err = -ENOENT; 2489 goto err; 2490 } 2491 2492 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 2493 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 2494 2495 fences[n] = ptr_pack_bits(syncobj, fence.flags, 2); 2496 } 2497 2498 return fences; 2499 2500 err: 2501 __free_fence_array(fences, n); 2502 return ERR_PTR(err); 2503 } 2504 2505 static void 2506 put_fence_array(struct drm_i915_gem_execbuffer2 *args, 2507 struct drm_syncobj **fences) 2508 { 2509 if (fences) 2510 __free_fence_array(fences, args->num_cliprects); 2511 } 2512 2513 static int 2514 await_fence_array(struct i915_execbuffer *eb, 2515 struct drm_syncobj **fences) 2516 { 2517 const unsigned int nfences = eb->args->num_cliprects; 2518 unsigned int n; 2519 int err; 2520 2521 for (n = 0; n < nfences; n++) { 2522 struct drm_syncobj *syncobj; 2523 struct dma_fence *fence; 2524 unsigned int flags; 2525 2526 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2527 if (!(flags & I915_EXEC_FENCE_WAIT)) 2528 continue; 2529 2530 fence = drm_syncobj_fence_get(syncobj); 2531 if (!fence) 2532 return -EINVAL; 2533 2534 err = i915_request_await_dma_fence(eb->request, fence); 2535 
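		/*
		 * The await keeps its own reference on the fence for as long
		 * as it needs it, so our temporary reference from
		 * drm_syncobj_fence_get() is dropped straight away below.
		 */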
dma_fence_put(fence); 2536 if (err < 0) 2537 return err; 2538 } 2539 2540 return 0; 2541 } 2542 2543 static void 2544 signal_fence_array(struct i915_execbuffer *eb, 2545 struct drm_syncobj **fences) 2546 { 2547 const unsigned int nfences = eb->args->num_cliprects; 2548 struct dma_fence * const fence = &eb->request->fence; 2549 unsigned int n; 2550 2551 for (n = 0; n < nfences; n++) { 2552 struct drm_syncobj *syncobj; 2553 unsigned int flags; 2554 2555 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2556 if (!(flags & I915_EXEC_FENCE_SIGNAL)) 2557 continue; 2558 2559 drm_syncobj_replace_fence(syncobj, fence); 2560 } 2561 } 2562 2563 static int 2564 i915_gem_do_execbuffer(struct drm_device *dev, 2565 struct drm_file *file, 2566 struct drm_i915_gem_execbuffer2 *args, 2567 struct drm_i915_gem_exec_object2 *exec, 2568 struct drm_syncobj **fences) 2569 { 2570 struct drm_i915_private *i915 = to_i915(dev); 2571 struct i915_execbuffer eb; 2572 struct dma_fence *in_fence = NULL; 2573 struct dma_fence *exec_fence = NULL; 2574 struct sync_file *out_fence = NULL; 2575 int out_fence_fd = -1; 2576 int err; 2577 2578 BUILD_BUG_ON(__EXEC_INTERNAL_FLAGS & ~__I915_EXEC_ILLEGAL_FLAGS); 2579 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 2580 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 2581 2582 eb.i915 = i915; 2583 eb.file = file; 2584 eb.args = args; 2585 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2586 args->flags |= __EXEC_HAS_RELOC; 2587 2588 eb.exec = exec; 2589 eb.vma = (struct i915_vma **)(exec + args->buffer_count + 1); 2590 eb.vma[0] = NULL; 2591 eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1); 2592 2593 eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; 2594 reloc_cache_init(&eb.reloc_cache, eb.i915); 2595 2596 eb.buffer_count = args->buffer_count; 2597 eb.batch_start_offset = args->batch_start_offset; 2598 eb.batch_len = args->batch_len; 2599 eb.trampoline = NULL; 2600 2601 eb.batch_flags = 0; 2602 if (args->flags & I915_EXEC_SECURE) { 2603 if (INTEL_GEN(i915) >= 11) 2604 return -ENODEV; 2605 2606 /* Return -EPERM to trigger fallback code on old binaries. 
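		 *
		 * (Gen11+ already bailed out with -ENODEV above; both the
		 * lack of hardware support and the lack of privilege below
		 * report -EPERM so that the old-binary fallback mentioned
		 * here still kicks in.)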
*/ 2607 if (!HAS_SECURE_BATCHES(i915)) 2608 return -EPERM; 2609 2610 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 2611 return -EPERM; 2612 2613 eb.batch_flags |= I915_DISPATCH_SECURE; 2614 } 2615 if (args->flags & I915_EXEC_IS_PINNED) 2616 eb.batch_flags |= I915_DISPATCH_PINNED; 2617 2618 if (args->flags & I915_EXEC_FENCE_IN) { 2619 in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2620 if (!in_fence) 2621 return -EINVAL; 2622 } 2623 2624 if (args->flags & I915_EXEC_FENCE_SUBMIT) { 2625 if (in_fence) { 2626 err = -EINVAL; 2627 goto err_in_fence; 2628 } 2629 2630 exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2631 if (!exec_fence) { 2632 err = -EINVAL; 2633 goto err_in_fence; 2634 } 2635 } 2636 2637 if (args->flags & I915_EXEC_FENCE_OUT) { 2638 out_fence_fd = get_unused_fd_flags(O_CLOEXEC); 2639 if (out_fence_fd < 0) { 2640 err = out_fence_fd; 2641 goto err_exec_fence; 2642 } 2643 } 2644 2645 err = eb_create(&eb); 2646 if (err) 2647 goto err_out_fence; 2648 2649 GEM_BUG_ON(!eb.lut_size); 2650 2651 err = eb_select_context(&eb); 2652 if (unlikely(err)) 2653 goto err_destroy; 2654 2655 err = eb_pin_engine(&eb, file, args); 2656 if (unlikely(err)) 2657 goto err_context; 2658 2659 err = i915_mutex_lock_interruptible(dev); 2660 if (err) 2661 goto err_engine; 2662 2663 err = eb_relocate(&eb); 2664 if (err) { 2665 /* 2666 * If the user expects the execobject.offset and 2667 * reloc.presumed_offset to be an exact match, 2668 * as for using NO_RELOC, then we cannot update 2669 * the execobject.offset until we have completed 2670 * relocation. 2671 */ 2672 args->flags &= ~__EXEC_HAS_RELOC; 2673 goto err_vma; 2674 } 2675 2676 if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) { 2677 drm_dbg(&i915->drm, 2678 "Attempting to use self-modifying batch buffer\n"); 2679 err = -EINVAL; 2680 goto err_vma; 2681 } 2682 if (eb.batch_start_offset > eb.batch->size || 2683 eb.batch_len > eb.batch->size - eb.batch_start_offset) { 2684 drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); 2685 err = -EINVAL; 2686 goto err_vma; 2687 } 2688 2689 if (eb.batch_len == 0) 2690 eb.batch_len = eb.batch->size - eb.batch_start_offset; 2691 2692 err = eb_parse(&eb); 2693 if (err) 2694 goto err_vma; 2695 2696 /* 2697 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2698 * batch" bit. Hence we need to pin secure batches into the global gtt. 2699 * hsw should have this fixed, but bdw mucks it up again. */ 2700 if (eb.batch_flags & I915_DISPATCH_SECURE) { 2701 struct i915_vma *vma; 2702 2703 /* 2704 * So on first glance it looks freaky that we pin the batch here 2705 * outside of the reservation loop. But: 2706 * - The batch is already pinned into the relevant ppgtt, so we 2707 * already have the backing storage fully allocated. 2708 * - No other BO uses the global gtt (well contexts, but meh), 2709 * so we don't really have issues with multiple objects not 2710 * fitting due to fragmentation. 2711 * So this is actually safe. 2712 */ 2713 vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0); 2714 if (IS_ERR(vma)) { 2715 err = PTR_ERR(vma); 2716 goto err_parse; 2717 } 2718 2719 eb.batch = vma; 2720 } 2721 2722 /* All GPU relocation batches must be submitted prior to the user rq */ 2723 GEM_BUG_ON(eb.reloc_cache.rq); 2724 2725 /* Allocate a request for this batch buffer nice and early. 
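	 *
	 * "Early" means before any awaits are attached: the in/submit fences
	 * and the syncobj wait array below are all added to this request,
	 * and every error path from here on goes through err_request, so the
	 * request is still added (and eventually retired) even if the batch
	 * is never submitted.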
	 */
	eb.request = i915_request_create(eb.context);
	if (IS_ERR(eb.request)) {
		err = PTR_ERR(eb.request);
		goto err_batch_unpin;
	}

	if (in_fence) {
		err = i915_request_await_dma_fence(eb.request, in_fence);
		if (err < 0)
			goto err_request;
	}

	if (exec_fence) {
		err = i915_request_await_execution(eb.request, exec_fence,
						   eb.engine->bond_execute);
		if (err < 0)
			goto err_request;
	}

	if (fences) {
		err = await_fence_array(&eb, fences);
		if (err)
			goto err_request;
	}

	if (out_fence_fd != -1) {
		out_fence = sync_file_create(&eb.request->fence);
		if (!out_fence) {
			err = -ENOMEM;
			goto err_request;
		}
	}

	/*
	 * Whilst this request exists, batch_obj will be on the
	 * active_list, and so will hold the active reference. Only when this
	 * request is retired will the batch_obj be moved onto the
	 * inactive_list and lose its active reference. Hence we do not need
	 * to explicitly hold another reference here.
	 */
	eb.request->batch = eb.batch;
	if (eb.batch->private)
		intel_engine_pool_mark_active(eb.batch->private, eb.request);

	trace_i915_request_queue(eb.request, eb.batch_flags);
	err = eb_submit(&eb);
err_request:
	add_to_client(eb.request, file);
	i915_request_get(eb.request);
	i915_request_add(eb.request);

	if (fences)
		signal_fence_array(&eb, fences);

	if (out_fence) {
		if (err == 0) {
			fd_install(out_fence_fd, out_fence->file);
			args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */
			args->rsvd2 |= (u64)out_fence_fd << 32;
			out_fence_fd = -1;
		} else {
			fput(out_fence->file);
		}
	}
	i915_request_put(eb.request);

err_batch_unpin:
	if (eb.batch_flags & I915_DISPATCH_SECURE)
		i915_vma_unpin(eb.batch);
err_parse:
	if (eb.batch->private)
		intel_engine_pool_put(eb.batch->private);
err_vma:
	if (eb.exec)
		eb_release_vmas(&eb);
	if (eb.trampoline)
		i915_vma_unpin(eb.trampoline);
	mutex_unlock(&dev->struct_mutex);
err_engine:
	eb_unpin_engine(&eb);
err_context:
	i915_gem_context_put(eb.gem_context);
err_destroy:
	eb_destroy(&eb);
err_out_fence:
	if (out_fence_fd != -1)
		put_unused_fd(out_fence_fd);
err_exec_fence:
	dma_fence_put(exec_fence);
err_in_fence:
	dma_fence_put(in_fence);
	return err;
}

static size_t eb_element_size(void)
{
	return (sizeof(struct drm_i915_gem_exec_object2) +
		sizeof(struct i915_vma *) +
		sizeof(unsigned int));
}

static bool check_buffer_count(size_t count)
{
	const size_t sz = eb_element_size();

	/*
	 * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup
	 * array size (see eb_create()). Otherwise, we can accept an array as
	 * large as can be addressed (though use large arrays at your peril)!
	 */

	return !(count < 1 || count > INT_MAX || count > SIZE_MAX / sz - 1);
}

/*
 * Legacy execbuffer just creates an exec2 list from the original exec object
 * list array and passes it to the real function.
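 *
 * The only translation beyond the field-by-field copy is that, before gen4,
 * every object is tagged with EXEC_OBJECT_NEEDS_FENCE, presumably because
 * those platforms need a fence register to access tiled buffers.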
2843 */ 2844 int 2845 i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data, 2846 struct drm_file *file) 2847 { 2848 struct drm_i915_private *i915 = to_i915(dev); 2849 struct drm_i915_gem_execbuffer *args = data; 2850 struct drm_i915_gem_execbuffer2 exec2; 2851 struct drm_i915_gem_exec_object *exec_list = NULL; 2852 struct drm_i915_gem_exec_object2 *exec2_list = NULL; 2853 const size_t count = args->buffer_count; 2854 unsigned int i; 2855 int err; 2856 2857 if (!check_buffer_count(count)) { 2858 drm_dbg(&i915->drm, "execbuf2 with %zd buffers\n", count); 2859 return -EINVAL; 2860 } 2861 2862 exec2.buffers_ptr = args->buffers_ptr; 2863 exec2.buffer_count = args->buffer_count; 2864 exec2.batch_start_offset = args->batch_start_offset; 2865 exec2.batch_len = args->batch_len; 2866 exec2.DR1 = args->DR1; 2867 exec2.DR4 = args->DR4; 2868 exec2.num_cliprects = args->num_cliprects; 2869 exec2.cliprects_ptr = args->cliprects_ptr; 2870 exec2.flags = I915_EXEC_RENDER; 2871 i915_execbuffer2_set_context_id(exec2, 0); 2872 2873 err = i915_gem_check_execbuffer(&exec2); 2874 if (err) 2875 return err; 2876 2877 /* Copy in the exec list from userland */ 2878 exec_list = kvmalloc_array(count, sizeof(*exec_list), 2879 __GFP_NOWARN | GFP_KERNEL); 2880 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2881 __GFP_NOWARN | GFP_KERNEL); 2882 if (exec_list == NULL || exec2_list == NULL) { 2883 drm_dbg(&i915->drm, 2884 "Failed to allocate exec list for %d buffers\n", 2885 args->buffer_count); 2886 kvfree(exec_list); 2887 kvfree(exec2_list); 2888 return -ENOMEM; 2889 } 2890 err = copy_from_user(exec_list, 2891 u64_to_user_ptr(args->buffers_ptr), 2892 sizeof(*exec_list) * count); 2893 if (err) { 2894 drm_dbg(&i915->drm, "copy %d exec entries failed %d\n", 2895 args->buffer_count, err); 2896 kvfree(exec_list); 2897 kvfree(exec2_list); 2898 return -EFAULT; 2899 } 2900 2901 for (i = 0; i < args->buffer_count; i++) { 2902 exec2_list[i].handle = exec_list[i].handle; 2903 exec2_list[i].relocation_count = exec_list[i].relocation_count; 2904 exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr; 2905 exec2_list[i].alignment = exec_list[i].alignment; 2906 exec2_list[i].offset = exec_list[i].offset; 2907 if (INTEL_GEN(to_i915(dev)) < 4) 2908 exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE; 2909 else 2910 exec2_list[i].flags = 0; 2911 } 2912 2913 err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list, NULL); 2914 if (exec2.flags & __EXEC_HAS_RELOC) { 2915 struct drm_i915_gem_exec_object __user *user_exec_list = 2916 u64_to_user_ptr(args->buffers_ptr); 2917 2918 /* Copy the new buffer offsets back to the user's exec list. 
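		 *
		 * Only entries whose offset carries the UPDATE
		 * (PIN_OFFSET_FIXED) marker were actually (re)bound by this
		 * execbuf. A failed __copy_to_user() merely stops the
		 * writeback; the execbuf itself has already run, so no error
		 * is reported for it.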
*/ 2919 for (i = 0; i < args->buffer_count; i++) { 2920 if (!(exec2_list[i].offset & UPDATE)) 2921 continue; 2922 2923 exec2_list[i].offset = 2924 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2925 exec2_list[i].offset &= PIN_OFFSET_MASK; 2926 if (__copy_to_user(&user_exec_list[i].offset, 2927 &exec2_list[i].offset, 2928 sizeof(user_exec_list[i].offset))) 2929 break; 2930 } 2931 } 2932 2933 kvfree(exec_list); 2934 kvfree(exec2_list); 2935 return err; 2936 } 2937 2938 int 2939 i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, 2940 struct drm_file *file) 2941 { 2942 struct drm_i915_private *i915 = to_i915(dev); 2943 struct drm_i915_gem_execbuffer2 *args = data; 2944 struct drm_i915_gem_exec_object2 *exec2_list; 2945 struct drm_syncobj **fences = NULL; 2946 const size_t count = args->buffer_count; 2947 int err; 2948 2949 if (!check_buffer_count(count)) { 2950 drm_dbg(&i915->drm, "execbuf2 with %zd buffers\n", count); 2951 return -EINVAL; 2952 } 2953 2954 err = i915_gem_check_execbuffer(args); 2955 if (err) 2956 return err; 2957 2958 /* Allocate an extra slot for use by the command parser */ 2959 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2960 __GFP_NOWARN | GFP_KERNEL); 2961 if (exec2_list == NULL) { 2962 drm_dbg(&i915->drm, "Failed to allocate exec list for %zd buffers\n", 2963 count); 2964 return -ENOMEM; 2965 } 2966 if (copy_from_user(exec2_list, 2967 u64_to_user_ptr(args->buffers_ptr), 2968 sizeof(*exec2_list) * count)) { 2969 drm_dbg(&i915->drm, "copy %zd exec entries failed\n", count); 2970 kvfree(exec2_list); 2971 return -EFAULT; 2972 } 2973 2974 if (args->flags & I915_EXEC_FENCE_ARRAY) { 2975 fences = get_fence_array(args, file); 2976 if (IS_ERR(fences)) { 2977 kvfree(exec2_list); 2978 return PTR_ERR(fences); 2979 } 2980 } 2981 2982 err = i915_gem_do_execbuffer(dev, file, args, exec2_list, fences); 2983 2984 /* 2985 * Now that we have begun execution of the batchbuffer, we ignore 2986 * any new error after this point. Also given that we have already 2987 * updated the associated relocations, we try to write out the current 2988 * object locations irrespective of any error. 2989 */ 2990 if (args->flags & __EXEC_HAS_RELOC) { 2991 struct drm_i915_gem_exec_object2 __user *user_exec_list = 2992 u64_to_user_ptr(args->buffers_ptr); 2993 unsigned int i; 2994 2995 /* Copy the new buffer offsets back to the user's exec list. */ 2996 /* 2997 * Note: count * sizeof(*user_exec_list) does not overflow, 2998 * because we checked 'count' in check_buffer_count(). 2999 * 3000 * And this range already got effectively checked earlier 3001 * when we did the "copy_from_user()" above. 3002 */ 3003 if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) 3004 goto end; 3005 3006 for (i = 0; i < args->buffer_count; i++) { 3007 if (!(exec2_list[i].offset & UPDATE)) 3008 continue; 3009 3010 exec2_list[i].offset = 3011 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 3012 unsafe_put_user(exec2_list[i].offset, 3013 &user_exec_list[i].offset, 3014 end_user); 3015 } 3016 end_user: 3017 user_access_end(); 3018 end:; 3019 } 3020 3021 args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS; 3022 put_fence_array(args, fences); 3023 kvfree(exec2_list); 3024 return err; 3025 } 3026
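
/*
 * Illustrative note (comment only, nothing here is compiled): the rsvd2
 * field of drm_i915_gem_execbuffer2 is multiplexed by the code above. The
 * input fence fd (I915_EXEC_FENCE_IN / I915_EXEC_FENCE_SUBMIT) is taken
 * from the low 32 bits, and the output fence fd (I915_EXEC_FENCE_OUT) is
 * handed back in the high 32 bits, so userspace does roughly:
 *
 *	args.rsvd2 = (__u64)in_fence_fd;
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &args);
 *	out_fence_fd = args.rsvd2 >> 32;
 *
 * using the _WR variant of the ioctl so that rsvd2 is copied back out to
 * userspace.
 */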