/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008,2010 Intel Corporation
 */

#include <linux/intel-iommu.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_syncobj.h>
#include <drm/i915_drm.h>

#include "display/intel_frontbuffer.h"

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_engine_pool.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_ioctls.h"
#include "i915_trace.h"

enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};

#define __EXEC_OBJECT_HAS_REF		BIT(31)
#define __EXEC_OBJECT_HAS_PIN		BIT(30)
#define __EXEC_OBJECT_HAS_FENCE		BIT(29)
#define __EXEC_OBJECT_NEEDS_MAP		BIT(28)
#define __EXEC_OBJECT_NEEDS_BIAS	BIT(27)
#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 27) /* all of the above */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)

#define __EXEC_HAS_RELOC	BIT(31)
#define __EXEC_VALIDATED	BIT(30)
#define __EXEC_INTERNAL_FLAGS	(~0u << 30)
#define UPDATE			PIN_OFFSET_FIXED

#define BATCH_OFFSET_BIAS (256*1024)

#define __I915_EXEC_ILLEGAL_FLAGS \
	(__I915_EXEC_UNKNOWN_FLAGS | \
	 I915_EXEC_CONSTANTS_MASK  | \
	 I915_EXEC_RESOURCE_STREAMER)

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute is to add content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer.
 *    The memory write writes the
 *    global sequence number of the request, ``i915_request::global_seqno``;
 *    the i915 driver uses the current value in the register to determine
 *    if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
 *
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
 *
 * The reservation is done in multiple phases. First we try and keep any
 * object already bound in its current location - so long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process rerun
 * after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. These objects are
 * simpler to place (the location is known a priori); all we have to do is make
 * sure the space is available.
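 *
 * As an illustration only (a minimal sketch of the uAPI usage, with
 * hypothetical handles and offsets, and libdrm's drmIoctl() as the caller),
 * a userspace driver relying on I915_EXEC_NO_RELOC would fill in the
 * execbuffer along these lines, keeping each execobject.offset equal to the
 * address it last wrote into its buffers::
 *
 *	struct drm_i915_gem_exec_object2 obj[2] = {};
 *	struct drm_i915_gem_execbuffer2 execbuf = {};
 *
 *	obj[0].handle = target_handle;          // hypothetical GEM handle
 *	obj[0].offset = presumed_target_offset; // matches reloc.presumed_offset
 *	obj[0].flags  = EXEC_OBJECT_WRITE;      // written to by the batch
 *
 *	obj[1].handle = batch_handle;           // batch is last by default
 *	obj[1].offset = presumed_batch_offset;
 *
 *	execbuf.buffers_ptr  = (uintptr_t)obj;
 *	execbuf.buffer_count = 2;
 *	execbuf.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC |
 *			I915_EXEC_HANDLE_LUT;
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *
 * If every presumed offset still matches where the kernel has the objects
 * bound, the relocation entries are never even read.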
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fall
 * back to a slower path. The slowpath has to drop the mutex. (Can you hear
 * alarm bells yet?) Dropping the mutex means that we lose all the state we
 * have built up so far for the execbuf and we must reset any global data.
 * However, we do leave the objects pinned in their final locations - which is
 * a potential issue for concurrent execbufs. Once we have left the mutex, we
 * can allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses with the objects.
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NORELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the
 * target's final address. If they differ, we have to map the current object
 * and rewrite the 4 or 8 byte pointer within.
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules, any
 * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
 * reads before starting, and any read (either using set-domain or pread) must
 * flush all GPU writes before starting. (Note we only employ a barrier before,
 * we currently rely on userspace not concurrently starting a new execution
 * whilst reading or writing to an object. This may be an advantage or not
 * depending on how much you trust userspace not to shoot themselves in the
 * foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest is to wait on the CPU until
 * all dependencies are resolved.
 *
 * After all of that, it is just a matter of closing the request and handing
 * it to the hardware (well, leaving it in a queue to be executed).
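 *
 * Tying the relocation rules above back to the uAPI structure, a single
 * entry asking for the GPU address of a target object plus a small delta
 * to be patched into the batch might look like the following (an
 * illustrative sketch only; the handles, offsets and domains are made up)::
 *
 *	struct drm_i915_gem_relocation_entry reloc = {
 *		.target_handle   = target_handle,
 *		.delta           = 0x100,
 *		.offset          = 0x40, // byte offset of the pointer in the batch
 *		.presumed_offset = last_known_target_address,
 *		.read_domains    = I915_GEM_DOMAIN_RENDER,
 *		.write_domain    = 0,
 *	};
 *
 * If presumed_offset still matches the target's final address, nothing is
 * rewritten; otherwise the 4 or 8 bytes at ``offset`` inside the batch are
 * updated to the canonical form of the target address plus delta.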
 *
 * However, we also offer the ability for batchbuffers to be run with
 * elevated privileges so that they access otherwise hidden registers. (Used
 * to adjust L3 cache etc.) Before any batch is given extra privileges we
 * first must check that it contains no nefarious instructions: we check that
 * each instruction is from our whitelist and all registers are also from an
 * allowed list. We first copy the user's batchbuffer to a shadow (so that the
 * user doesn't have access to it, either by the CPU or GPU as we scan it) and
 * then parse each instruction. If everything is ok, we set a flag telling the
 * hardware to run the batchbuffer in trusted mode, otherwise the ioctl is
 * rejected.
 */

struct i915_execbuffer {
	struct drm_i915_private *i915; /** i915 backpointer */
	struct drm_file *file; /** per-file lookup tables and limits */
	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
	struct i915_vma **vma;
	unsigned int *flags;

	struct intel_engine_cs *engine; /** engine to queue the request to */
	struct intel_context *context; /* logical state for the request */
	struct i915_gem_context *gem_context; /** caller's context */

	struct i915_request *request; /** our request to build */
	struct i915_vma *batch; /** identity of the batch obj/vma */

	/** actual size of execobj[] as we may extend it for the cmdparser */
	unsigned int buffer_count;

	/** list of vma not yet bound during reservation phase */
	struct list_head unbound;

	/** list of vma that have execobj.relocation_count */
	struct list_head relocs;

	/**
	 * Track the most recently used object for relocations, as we
	 * frequently have to perform multiple relocations within the same
	 * obj/page
	 */
	struct reloc_cache {
		struct drm_mm_node node; /** temporary GTT binding */
		unsigned long vaddr; /** Current kmap address */
		unsigned long page; /** Currently mapped page index */
		unsigned int gen; /** Cached value of INTEL_GEN */
		bool use_64bit_reloc : 1;
		bool has_llc : 1;
		bool has_fence : 1;
		bool needs_unfenced : 1;

		struct intel_context *ce;
		struct i915_request *rq;
		u32 *rq_cmd;
		unsigned int rq_size;
	} reloc_cache;

	u64 invalid_flags; /** Set of execobj.flags that are invalid */
	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

	u32 batch_start_offset; /** Location within object of batch */
	u32 batch_len; /** Length of batch within object */
	u32 batch_flags; /** Flags composed for emit_bb_start() */

	/**
	 * Indicate either the size of the hashtable used to resolve
	 * relocation handles, or if negative that we are using a direct
	 * index into the execobj[].
	 */
	int lut_size;
	struct hlist_head *buckets; /** ht for relocation handles */
};

#define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags])

/*
 * Used to convert any address to canonical form.
 * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
 * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
 * addresses to be in a canonical form:
 * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
 * canonical form [63:48] == [47]."
287 */ 288 #define GEN8_HIGH_ADDRESS_BIT 47 289 static inline u64 gen8_canonical_addr(u64 address) 290 { 291 return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT); 292 } 293 294 static inline u64 gen8_noncanonical_addr(u64 address) 295 { 296 return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0); 297 } 298 299 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 300 { 301 return intel_engine_requires_cmd_parser(eb->engine) || 302 (intel_engine_using_cmd_parser(eb->engine) && 303 eb->args->batch_len); 304 } 305 306 static int eb_create(struct i915_execbuffer *eb) 307 { 308 if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { 309 unsigned int size = 1 + ilog2(eb->buffer_count); 310 311 /* 312 * Without a 1:1 association between relocation handles and 313 * the execobject[] index, we instead create a hashtable. 314 * We size it dynamically based on available memory, starting 315 * first with 1:1 assocative hash and scaling back until 316 * the allocation succeeds. 317 * 318 * Later on we use a positive lut_size to indicate we are 319 * using this hashtable, and a negative value to indicate a 320 * direct lookup. 321 */ 322 do { 323 gfp_t flags; 324 325 /* While we can still reduce the allocation size, don't 326 * raise a warning and allow the allocation to fail. 327 * On the last pass though, we want to try as hard 328 * as possible to perform the allocation and warn 329 * if it fails. 330 */ 331 flags = GFP_KERNEL; 332 if (size > 1) 333 flags |= __GFP_NORETRY | __GFP_NOWARN; 334 335 eb->buckets = kzalloc(sizeof(struct hlist_head) << size, 336 flags); 337 if (eb->buckets) 338 break; 339 } while (--size); 340 341 if (unlikely(!size)) 342 return -ENOMEM; 343 344 eb->lut_size = size; 345 } else { 346 eb->lut_size = -eb->buffer_count; 347 } 348 349 return 0; 350 } 351 352 static bool 353 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry, 354 const struct i915_vma *vma, 355 unsigned int flags) 356 { 357 if (vma->node.size < entry->pad_to_size) 358 return true; 359 360 if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment)) 361 return true; 362 363 if (flags & EXEC_OBJECT_PINNED && 364 vma->node.start != entry->offset) 365 return true; 366 367 if (flags & __EXEC_OBJECT_NEEDS_BIAS && 368 vma->node.start < BATCH_OFFSET_BIAS) 369 return true; 370 371 if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) && 372 (vma->node.start + vma->node.size - 1) >> 32) 373 return true; 374 375 if (flags & __EXEC_OBJECT_NEEDS_MAP && 376 !i915_vma_is_map_and_fenceable(vma)) 377 return true; 378 379 return false; 380 } 381 382 static inline bool 383 eb_pin_vma(struct i915_execbuffer *eb, 384 const struct drm_i915_gem_exec_object2 *entry, 385 struct i915_vma *vma) 386 { 387 unsigned int exec_flags = *vma->exec_flags; 388 u64 pin_flags; 389 390 if (vma->node.size) 391 pin_flags = vma->node.start; 392 else 393 pin_flags = entry->offset & PIN_OFFSET_MASK; 394 395 pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED; 396 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT)) 397 pin_flags |= PIN_GLOBAL; 398 399 if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) 400 return false; 401 402 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 403 if (unlikely(i915_vma_pin_fence(vma))) { 404 i915_vma_unpin(vma); 405 return false; 406 } 407 408 if (vma->fence) 409 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 410 } 411 412 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 413 return !eb_vma_misplaced(entry, vma, exec_flags); 414 } 415 416 static inline void __eb_unreserve_vma(struct i915_vma *vma, 
unsigned int flags) 417 { 418 GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN)); 419 420 if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE)) 421 __i915_vma_unpin_fence(vma); 422 423 __i915_vma_unpin(vma); 424 } 425 426 static inline void 427 eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags) 428 { 429 if (!(*flags & __EXEC_OBJECT_HAS_PIN)) 430 return; 431 432 __eb_unreserve_vma(vma, *flags); 433 *flags &= ~__EXEC_OBJECT_RESERVED; 434 } 435 436 static int 437 eb_validate_vma(struct i915_execbuffer *eb, 438 struct drm_i915_gem_exec_object2 *entry, 439 struct i915_vma *vma) 440 { 441 if (unlikely(entry->flags & eb->invalid_flags)) 442 return -EINVAL; 443 444 if (unlikely(entry->alignment && !is_power_of_2(entry->alignment))) 445 return -EINVAL; 446 447 /* 448 * Offset can be used as input (EXEC_OBJECT_PINNED), reject 449 * any non-page-aligned or non-canonical addresses. 450 */ 451 if (unlikely(entry->flags & EXEC_OBJECT_PINNED && 452 entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) 453 return -EINVAL; 454 455 /* pad_to_size was once a reserved field, so sanitize it */ 456 if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { 457 if (unlikely(offset_in_page(entry->pad_to_size))) 458 return -EINVAL; 459 } else { 460 entry->pad_to_size = 0; 461 } 462 463 if (unlikely(vma->exec_flags)) { 464 DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n", 465 entry->handle, (int)(entry - eb->exec)); 466 return -EINVAL; 467 } 468 469 /* 470 * From drm_mm perspective address space is continuous, 471 * so from this point we're always using non-canonical 472 * form internally. 473 */ 474 entry->offset = gen8_noncanonical_addr(entry->offset); 475 476 if (!eb->reloc_cache.has_fence) { 477 entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE; 478 } else { 479 if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE || 480 eb->reloc_cache.needs_unfenced) && 481 i915_gem_object_is_tiled(vma->obj)) 482 entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; 483 } 484 485 if (!(entry->flags & EXEC_OBJECT_PINNED)) 486 entry->flags |= eb->context_flags; 487 488 return 0; 489 } 490 491 static int 492 eb_add_vma(struct i915_execbuffer *eb, 493 unsigned int i, unsigned batch_idx, 494 struct i915_vma *vma) 495 { 496 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 497 int err; 498 499 GEM_BUG_ON(i915_vma_is_closed(vma)); 500 501 if (!(eb->args->flags & __EXEC_VALIDATED)) { 502 err = eb_validate_vma(eb, entry, vma); 503 if (unlikely(err)) 504 return err; 505 } 506 507 if (eb->lut_size > 0) { 508 vma->exec_handle = entry->handle; 509 hlist_add_head(&vma->exec_node, 510 &eb->buckets[hash_32(entry->handle, 511 eb->lut_size)]); 512 } 513 514 if (entry->relocation_count) 515 list_add_tail(&vma->reloc_link, &eb->relocs); 516 517 /* 518 * Stash a pointer from the vma to execobj, so we can query its flags, 519 * size, alignment etc as provided by the user. Also we stash a pointer 520 * to the vma inside the execobj so that we can use a direct lookup 521 * to find the right target VMA when doing relocations. 522 */ 523 eb->vma[i] = vma; 524 eb->flags[i] = entry->flags; 525 vma->exec_flags = &eb->flags[i]; 526 527 /* 528 * SNA is doing fancy tricks with compressing batch buffers, which leads 529 * to negative relocation deltas. Usually that works out ok since the 530 * relocate address is still positive, except when the batch is placed 531 * very low in the GTT. Ensure this doesn't happen. 532 * 533 * Note that actual hangs have only been observed on gen7, but for 534 * paranoia do it everywhere. 
535 */ 536 if (i == batch_idx) { 537 if (entry->relocation_count && 538 !(eb->flags[i] & EXEC_OBJECT_PINNED)) 539 eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; 540 if (eb->reloc_cache.has_fence) 541 eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; 542 543 eb->batch = vma; 544 } 545 546 err = 0; 547 if (eb_pin_vma(eb, entry, vma)) { 548 if (entry->offset != vma->node.start) { 549 entry->offset = vma->node.start | UPDATE; 550 eb->args->flags |= __EXEC_HAS_RELOC; 551 } 552 } else { 553 eb_unreserve_vma(vma, vma->exec_flags); 554 555 list_add_tail(&vma->exec_link, &eb->unbound); 556 if (drm_mm_node_allocated(&vma->node)) 557 err = i915_vma_unbind(vma); 558 if (unlikely(err)) 559 vma->exec_flags = NULL; 560 } 561 return err; 562 } 563 564 static inline int use_cpu_reloc(const struct reloc_cache *cache, 565 const struct drm_i915_gem_object *obj) 566 { 567 if (!i915_gem_object_has_struct_page(obj)) 568 return false; 569 570 if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 571 return true; 572 573 if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 574 return false; 575 576 return (cache->has_llc || 577 obj->cache_dirty || 578 obj->cache_level != I915_CACHE_NONE); 579 } 580 581 static int eb_reserve_vma(const struct i915_execbuffer *eb, 582 struct i915_vma *vma) 583 { 584 struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 585 unsigned int exec_flags = *vma->exec_flags; 586 u64 pin_flags; 587 int err; 588 589 pin_flags = PIN_USER | PIN_NONBLOCK; 590 if (exec_flags & EXEC_OBJECT_NEEDS_GTT) 591 pin_flags |= PIN_GLOBAL; 592 593 /* 594 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, 595 * limit address to the first 4GBs for unflagged objects. 596 */ 597 if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 598 pin_flags |= PIN_ZONE_4G; 599 600 if (exec_flags & __EXEC_OBJECT_NEEDS_MAP) 601 pin_flags |= PIN_MAPPABLE; 602 603 if (exec_flags & EXEC_OBJECT_PINNED) { 604 pin_flags |= entry->offset | PIN_OFFSET_FIXED; 605 pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */ 606 } else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) { 607 pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; 608 } 609 610 err = i915_vma_pin(vma, 611 entry->pad_to_size, entry->alignment, 612 pin_flags); 613 if (err) 614 return err; 615 616 if (entry->offset != vma->node.start) { 617 entry->offset = vma->node.start | UPDATE; 618 eb->args->flags |= __EXEC_HAS_RELOC; 619 } 620 621 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 622 err = i915_vma_pin_fence(vma); 623 if (unlikely(err)) { 624 i915_vma_unpin(vma); 625 return err; 626 } 627 628 if (vma->fence) 629 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 630 } 631 632 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 633 GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags)); 634 635 return 0; 636 } 637 638 static int eb_reserve(struct i915_execbuffer *eb) 639 { 640 const unsigned int count = eb->buffer_count; 641 struct list_head last; 642 struct i915_vma *vma; 643 unsigned int i, pass; 644 int err; 645 646 /* 647 * Attempt to pin all of the buffers into the GTT. 648 * This is done in 3 phases: 649 * 650 * 1a. Unbind all objects that do not match the GTT constraints for 651 * the execbuffer (fenceable, mappable, alignment etc). 652 * 1b. Increment pin count for already bound objects. 653 * 2. Bind new objects. 654 * 3. Decrement pin count. 655 * 656 * This avoid unnecessary unbinding of later objects in order to make 657 * room for the earlier objects *unless* we need to defragment. 
658 */ 659 660 pass = 0; 661 err = 0; 662 do { 663 list_for_each_entry(vma, &eb->unbound, exec_link) { 664 err = eb_reserve_vma(eb, vma); 665 if (err) 666 break; 667 } 668 if (err != -ENOSPC) 669 return err; 670 671 /* Resort *all* the objects into priority order */ 672 INIT_LIST_HEAD(&eb->unbound); 673 INIT_LIST_HEAD(&last); 674 for (i = 0; i < count; i++) { 675 unsigned int flags = eb->flags[i]; 676 struct i915_vma *vma = eb->vma[i]; 677 678 if (flags & EXEC_OBJECT_PINNED && 679 flags & __EXEC_OBJECT_HAS_PIN) 680 continue; 681 682 eb_unreserve_vma(vma, &eb->flags[i]); 683 684 if (flags & EXEC_OBJECT_PINNED) 685 /* Pinned must have their slot */ 686 list_add(&vma->exec_link, &eb->unbound); 687 else if (flags & __EXEC_OBJECT_NEEDS_MAP) 688 /* Map require the lowest 256MiB (aperture) */ 689 list_add_tail(&vma->exec_link, &eb->unbound); 690 else if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 691 /* Prioritise 4GiB region for restricted bo */ 692 list_add(&vma->exec_link, &last); 693 else 694 list_add_tail(&vma->exec_link, &last); 695 } 696 list_splice_tail(&last, &eb->unbound); 697 698 switch (pass++) { 699 case 0: 700 break; 701 702 case 1: 703 /* Too fragmented, unbind everything and retry */ 704 mutex_lock(&eb->context->vm->mutex); 705 err = i915_gem_evict_vm(eb->context->vm); 706 mutex_unlock(&eb->context->vm->mutex); 707 if (err) 708 return err; 709 break; 710 711 default: 712 return -ENOSPC; 713 } 714 } while (1); 715 } 716 717 static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 718 { 719 if (eb->args->flags & I915_EXEC_BATCH_FIRST) 720 return 0; 721 else 722 return eb->buffer_count - 1; 723 } 724 725 static int eb_select_context(struct i915_execbuffer *eb) 726 { 727 struct i915_gem_context *ctx; 728 729 ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1); 730 if (unlikely(!ctx)) 731 return -ENOENT; 732 733 eb->gem_context = ctx; 734 if (rcu_access_pointer(ctx->vm)) 735 eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; 736 737 eb->context_flags = 0; 738 if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) 739 eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; 740 741 return 0; 742 } 743 744 static int eb_lookup_vmas(struct i915_execbuffer *eb) 745 { 746 struct radix_tree_root *handles_vma = &eb->gem_context->handles_vma; 747 struct drm_i915_gem_object *obj; 748 unsigned int i, batch; 749 int err; 750 751 if (unlikely(i915_gem_context_is_banned(eb->gem_context))) 752 return -EIO; 753 754 INIT_LIST_HEAD(&eb->relocs); 755 INIT_LIST_HEAD(&eb->unbound); 756 757 batch = eb_batch_index(eb); 758 759 mutex_lock(&eb->gem_context->mutex); 760 if (unlikely(i915_gem_context_is_closed(eb->gem_context))) { 761 err = -ENOENT; 762 goto err_ctx; 763 } 764 765 for (i = 0; i < eb->buffer_count; i++) { 766 u32 handle = eb->exec[i].handle; 767 struct i915_lut_handle *lut; 768 struct i915_vma *vma; 769 770 vma = radix_tree_lookup(handles_vma, handle); 771 if (likely(vma)) 772 goto add_vma; 773 774 obj = i915_gem_object_lookup(eb->file, handle); 775 if (unlikely(!obj)) { 776 err = -ENOENT; 777 goto err_vma; 778 } 779 780 vma = i915_vma_instance(obj, eb->context->vm, NULL); 781 if (IS_ERR(vma)) { 782 err = PTR_ERR(vma); 783 goto err_obj; 784 } 785 786 lut = i915_lut_handle_alloc(); 787 if (unlikely(!lut)) { 788 err = -ENOMEM; 789 goto err_obj; 790 } 791 792 err = radix_tree_insert(handles_vma, handle, vma); 793 if (unlikely(err)) { 794 i915_lut_handle_free(lut); 795 goto err_obj; 796 } 797 798 /* transfer ref to lut */ 799 if (!atomic_fetch_inc(&vma->open_count)) 800 
i915_vma_reopen(vma); 801 lut->handle = handle; 802 lut->ctx = eb->gem_context; 803 804 i915_gem_object_lock(obj); 805 list_add(&lut->obj_link, &obj->lut_list); 806 i915_gem_object_unlock(obj); 807 808 add_vma: 809 err = eb_add_vma(eb, i, batch, vma); 810 if (unlikely(err)) 811 goto err_vma; 812 813 GEM_BUG_ON(vma != eb->vma[i]); 814 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 815 GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && 816 eb_vma_misplaced(&eb->exec[i], vma, eb->flags[i])); 817 } 818 819 mutex_unlock(&eb->gem_context->mutex); 820 821 eb->args->flags |= __EXEC_VALIDATED; 822 return eb_reserve(eb); 823 824 err_obj: 825 i915_gem_object_put(obj); 826 err_vma: 827 eb->vma[i] = NULL; 828 err_ctx: 829 mutex_unlock(&eb->gem_context->mutex); 830 return err; 831 } 832 833 static struct i915_vma * 834 eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) 835 { 836 if (eb->lut_size < 0) { 837 if (handle >= -eb->lut_size) 838 return NULL; 839 return eb->vma[handle]; 840 } else { 841 struct hlist_head *head; 842 struct i915_vma *vma; 843 844 head = &eb->buckets[hash_32(handle, eb->lut_size)]; 845 hlist_for_each_entry(vma, head, exec_node) { 846 if (vma->exec_handle == handle) 847 return vma; 848 } 849 return NULL; 850 } 851 } 852 853 static void eb_release_vmas(const struct i915_execbuffer *eb) 854 { 855 const unsigned int count = eb->buffer_count; 856 unsigned int i; 857 858 for (i = 0; i < count; i++) { 859 struct i915_vma *vma = eb->vma[i]; 860 unsigned int flags = eb->flags[i]; 861 862 if (!vma) 863 break; 864 865 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 866 vma->exec_flags = NULL; 867 eb->vma[i] = NULL; 868 869 if (flags & __EXEC_OBJECT_HAS_PIN) 870 __eb_unreserve_vma(vma, flags); 871 872 if (flags & __EXEC_OBJECT_HAS_REF) 873 i915_vma_put(vma); 874 } 875 } 876 877 static void eb_reset_vmas(const struct i915_execbuffer *eb) 878 { 879 eb_release_vmas(eb); 880 if (eb->lut_size > 0) 881 memset(eb->buckets, 0, 882 sizeof(struct hlist_head) << eb->lut_size); 883 } 884 885 static void eb_destroy(const struct i915_execbuffer *eb) 886 { 887 GEM_BUG_ON(eb->reloc_cache.rq); 888 889 if (eb->reloc_cache.ce) 890 intel_context_put(eb->reloc_cache.ce); 891 892 if (eb->lut_size > 0) 893 kfree(eb->buckets); 894 } 895 896 static inline u64 897 relocation_target(const struct drm_i915_gem_relocation_entry *reloc, 898 const struct i915_vma *target) 899 { 900 return gen8_canonical_addr((int)reloc->delta + target->node.start); 901 } 902 903 static void reloc_cache_init(struct reloc_cache *cache, 904 struct drm_i915_private *i915) 905 { 906 cache->page = -1; 907 cache->vaddr = 0; 908 /* Must be a variable in the struct to allow GCC to unroll. 
*/ 909 cache->gen = INTEL_GEN(i915); 910 cache->has_llc = HAS_LLC(i915); 911 cache->use_64bit_reloc = HAS_64BIT_RELOC(i915); 912 cache->has_fence = cache->gen < 4; 913 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 914 cache->node.flags = 0; 915 cache->ce = NULL; 916 cache->rq = NULL; 917 cache->rq_size = 0; 918 } 919 920 static inline void *unmask_page(unsigned long p) 921 { 922 return (void *)(uintptr_t)(p & PAGE_MASK); 923 } 924 925 static inline unsigned int unmask_flags(unsigned long p) 926 { 927 return p & ~PAGE_MASK; 928 } 929 930 #define KMAP 0x4 /* after CLFLUSH_FLAGS */ 931 932 static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) 933 { 934 struct drm_i915_private *i915 = 935 container_of(cache, struct i915_execbuffer, reloc_cache)->i915; 936 return &i915->ggtt; 937 } 938 939 static void reloc_gpu_flush(struct reloc_cache *cache) 940 { 941 GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32)); 942 cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; 943 944 __i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size); 945 i915_gem_object_unpin_map(cache->rq->batch->obj); 946 947 intel_gt_chipset_flush(cache->rq->engine->gt); 948 949 i915_request_add(cache->rq); 950 cache->rq = NULL; 951 } 952 953 static void reloc_cache_reset(struct reloc_cache *cache) 954 { 955 void *vaddr; 956 957 if (cache->rq) 958 reloc_gpu_flush(cache); 959 960 if (!cache->vaddr) 961 return; 962 963 vaddr = unmask_page(cache->vaddr); 964 if (cache->vaddr & KMAP) { 965 if (cache->vaddr & CLFLUSH_AFTER) 966 mb(); 967 968 kunmap_atomic(vaddr); 969 i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); 970 } else { 971 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 972 973 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 974 io_mapping_unmap_atomic((void __iomem *)vaddr); 975 976 if (drm_mm_node_allocated(&cache->node)) { 977 ggtt->vm.clear_range(&ggtt->vm, 978 cache->node.start, 979 cache->node.size); 980 mutex_lock(&ggtt->vm.mutex); 981 drm_mm_remove_node(&cache->node); 982 mutex_unlock(&ggtt->vm.mutex); 983 } else { 984 i915_vma_unpin((struct i915_vma *)cache->node.mm); 985 } 986 } 987 988 cache->vaddr = 0; 989 cache->page = -1; 990 } 991 992 static void *reloc_kmap(struct drm_i915_gem_object *obj, 993 struct reloc_cache *cache, 994 unsigned long page) 995 { 996 void *vaddr; 997 998 if (cache->vaddr) { 999 kunmap_atomic(unmask_page(cache->vaddr)); 1000 } else { 1001 unsigned int flushes; 1002 int err; 1003 1004 err = i915_gem_object_prepare_write(obj, &flushes); 1005 if (err) 1006 return ERR_PTR(err); 1007 1008 BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 1009 BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 1010 1011 cache->vaddr = flushes | KMAP; 1012 cache->node.mm = (void *)obj; 1013 if (flushes) 1014 mb(); 1015 } 1016 1017 vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); 1018 cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 1019 cache->page = page; 1020 1021 return vaddr; 1022 } 1023 1024 static void *reloc_iomap(struct drm_i915_gem_object *obj, 1025 struct reloc_cache *cache, 1026 unsigned long page) 1027 { 1028 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1029 unsigned long offset; 1030 void *vaddr; 1031 1032 if (cache->vaddr) { 1033 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1034 io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); 1035 } else { 1036 struct i915_vma *vma; 1037 int err; 1038 1039 if (i915_gem_object_is_tiled(obj)) 1040 return ERR_PTR(-EINVAL); 1041 1042 
if (use_cpu_reloc(cache, obj)) 1043 return NULL; 1044 1045 i915_gem_object_lock(obj); 1046 err = i915_gem_object_set_to_gtt_domain(obj, true); 1047 i915_gem_object_unlock(obj); 1048 if (err) 1049 return ERR_PTR(err); 1050 1051 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 1052 PIN_MAPPABLE | 1053 PIN_NONBLOCK /* NOWARN */ | 1054 PIN_NOEVICT); 1055 if (IS_ERR(vma)) { 1056 memset(&cache->node, 0, sizeof(cache->node)); 1057 mutex_lock(&ggtt->vm.mutex); 1058 err = drm_mm_insert_node_in_range 1059 (&ggtt->vm.mm, &cache->node, 1060 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1061 0, ggtt->mappable_end, 1062 DRM_MM_INSERT_LOW); 1063 mutex_unlock(&ggtt->vm.mutex); 1064 if (err) /* no inactive aperture space, use cpu reloc */ 1065 return NULL; 1066 } else { 1067 cache->node.start = vma->node.start; 1068 cache->node.mm = (void *)vma; 1069 } 1070 } 1071 1072 offset = cache->node.start; 1073 if (drm_mm_node_allocated(&cache->node)) { 1074 ggtt->vm.insert_page(&ggtt->vm, 1075 i915_gem_object_get_dma_address(obj, page), 1076 offset, I915_CACHE_NONE, 0); 1077 } else { 1078 offset += page << PAGE_SHIFT; 1079 } 1080 1081 vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1082 offset); 1083 cache->page = page; 1084 cache->vaddr = (unsigned long)vaddr; 1085 1086 return vaddr; 1087 } 1088 1089 static void *reloc_vaddr(struct drm_i915_gem_object *obj, 1090 struct reloc_cache *cache, 1091 unsigned long page) 1092 { 1093 void *vaddr; 1094 1095 if (cache->page == page) { 1096 vaddr = unmask_page(cache->vaddr); 1097 } else { 1098 vaddr = NULL; 1099 if ((cache->vaddr & KMAP) == 0) 1100 vaddr = reloc_iomap(obj, cache, page); 1101 if (!vaddr) 1102 vaddr = reloc_kmap(obj, cache, page); 1103 } 1104 1105 return vaddr; 1106 } 1107 1108 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1109 { 1110 if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1111 if (flushes & CLFLUSH_BEFORE) { 1112 clflushopt(addr); 1113 mb(); 1114 } 1115 1116 *addr = value; 1117 1118 /* 1119 * Writes to the same cacheline are serialised by the CPU 1120 * (including clflush). On the write path, we only require 1121 * that it hits memory in an orderly fashion and place 1122 * mb barriers at the start and end of the relocation phase 1123 * to ensure ordering of clflush wrt to the system. 1124 */ 1125 if (flushes & CLFLUSH_AFTER) 1126 clflushopt(addr); 1127 } else 1128 *addr = value; 1129 } 1130 1131 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) 1132 { 1133 struct drm_i915_gem_object *obj = vma->obj; 1134 int err; 1135 1136 i915_vma_lock(vma); 1137 1138 if (obj->cache_dirty & ~obj->cache_coherent) 1139 i915_gem_clflush_object(obj, 0); 1140 obj->write_domain = 0; 1141 1142 err = i915_request_await_object(rq, vma->obj, true); 1143 if (err == 0) 1144 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 1145 1146 i915_vma_unlock(vma); 1147 1148 return err; 1149 } 1150 1151 static int __reloc_gpu_alloc(struct i915_execbuffer *eb, 1152 struct i915_vma *vma, 1153 unsigned int len) 1154 { 1155 struct reloc_cache *cache = &eb->reloc_cache; 1156 struct intel_engine_pool_node *pool; 1157 struct i915_request *rq; 1158 struct i915_vma *batch; 1159 u32 *cmd; 1160 int err; 1161 1162 pool = intel_engine_get_pool(eb->engine, PAGE_SIZE); 1163 if (IS_ERR(pool)) 1164 return PTR_ERR(pool); 1165 1166 cmd = i915_gem_object_pin_map(pool->obj, 1167 cache->has_llc ? 
1168 I915_MAP_FORCE_WB : 1169 I915_MAP_FORCE_WC); 1170 if (IS_ERR(cmd)) { 1171 err = PTR_ERR(cmd); 1172 goto out_pool; 1173 } 1174 1175 batch = i915_vma_instance(pool->obj, vma->vm, NULL); 1176 if (IS_ERR(batch)) { 1177 err = PTR_ERR(batch); 1178 goto err_unmap; 1179 } 1180 1181 err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); 1182 if (err) 1183 goto err_unmap; 1184 1185 rq = intel_context_create_request(cache->ce); 1186 if (IS_ERR(rq)) { 1187 err = PTR_ERR(rq); 1188 goto err_unpin; 1189 } 1190 1191 err = intel_engine_pool_mark_active(pool, rq); 1192 if (err) 1193 goto err_request; 1194 1195 err = reloc_move_to_gpu(rq, vma); 1196 if (err) 1197 goto err_request; 1198 1199 err = eb->engine->emit_bb_start(rq, 1200 batch->node.start, PAGE_SIZE, 1201 cache->gen > 5 ? 0 : I915_DISPATCH_SECURE); 1202 if (err) 1203 goto skip_request; 1204 1205 i915_vma_lock(batch); 1206 err = i915_request_await_object(rq, batch->obj, false); 1207 if (err == 0) 1208 err = i915_vma_move_to_active(batch, rq, 0); 1209 i915_vma_unlock(batch); 1210 if (err) 1211 goto skip_request; 1212 1213 rq->batch = batch; 1214 i915_vma_unpin(batch); 1215 1216 cache->rq = rq; 1217 cache->rq_cmd = cmd; 1218 cache->rq_size = 0; 1219 1220 /* Return with batch mapping (cmd) still pinned */ 1221 goto out_pool; 1222 1223 skip_request: 1224 i915_request_skip(rq, err); 1225 err_request: 1226 i915_request_add(rq); 1227 err_unpin: 1228 i915_vma_unpin(batch); 1229 err_unmap: 1230 i915_gem_object_unpin_map(pool->obj); 1231 out_pool: 1232 intel_engine_pool_put(pool); 1233 return err; 1234 } 1235 1236 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1237 struct i915_vma *vma, 1238 unsigned int len) 1239 { 1240 struct reloc_cache *cache = &eb->reloc_cache; 1241 u32 *cmd; 1242 1243 if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) 1244 reloc_gpu_flush(cache); 1245 1246 if (unlikely(!cache->rq)) { 1247 int err; 1248 1249 /* If we need to copy for the cmdparser, we will stall anyway */ 1250 if (eb_use_cmdparser(eb)) 1251 return ERR_PTR(-EWOULDBLOCK); 1252 1253 if (!intel_engine_can_store_dword(eb->engine)) 1254 return ERR_PTR(-ENODEV); 1255 1256 if (!cache->ce) { 1257 struct intel_context *ce; 1258 1259 /* 1260 * The CS pre-parser can pre-fetch commands across 1261 * memory sync points and starting gen12 it is able to 1262 * pre-fetch across BB_START and BB_END boundaries 1263 * (within the same context). We therefore use a 1264 * separate context gen12+ to guarantee that the reloc 1265 * writes land before the parser gets to the target 1266 * memory location. 
1267 */ 1268 if (cache->gen >= 12) 1269 ce = intel_context_create(eb->context->gem_context, 1270 eb->engine); 1271 else 1272 ce = intel_context_get(eb->context); 1273 if (IS_ERR(ce)) 1274 return ERR_CAST(ce); 1275 1276 cache->ce = ce; 1277 } 1278 1279 err = __reloc_gpu_alloc(eb, vma, len); 1280 if (unlikely(err)) 1281 return ERR_PTR(err); 1282 } 1283 1284 cmd = cache->rq_cmd + cache->rq_size; 1285 cache->rq_size += len; 1286 1287 return cmd; 1288 } 1289 1290 static u64 1291 relocate_entry(struct i915_vma *vma, 1292 const struct drm_i915_gem_relocation_entry *reloc, 1293 struct i915_execbuffer *eb, 1294 const struct i915_vma *target) 1295 { 1296 u64 offset = reloc->offset; 1297 u64 target_offset = relocation_target(reloc, target); 1298 bool wide = eb->reloc_cache.use_64bit_reloc; 1299 void *vaddr; 1300 1301 if (!eb->reloc_cache.vaddr && 1302 (DBG_FORCE_RELOC == FORCE_GPU_RELOC || 1303 !dma_resv_test_signaled_rcu(vma->resv, true))) { 1304 const unsigned int gen = eb->reloc_cache.gen; 1305 unsigned int len; 1306 u32 *batch; 1307 u64 addr; 1308 1309 if (wide) 1310 len = offset & 7 ? 8 : 5; 1311 else if (gen >= 4) 1312 len = 4; 1313 else 1314 len = 3; 1315 1316 batch = reloc_gpu(eb, vma, len); 1317 if (IS_ERR(batch)) 1318 goto repeat; 1319 1320 addr = gen8_canonical_addr(vma->node.start + offset); 1321 if (wide) { 1322 if (offset & 7) { 1323 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1324 *batch++ = lower_32_bits(addr); 1325 *batch++ = upper_32_bits(addr); 1326 *batch++ = lower_32_bits(target_offset); 1327 1328 addr = gen8_canonical_addr(addr + 4); 1329 1330 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1331 *batch++ = lower_32_bits(addr); 1332 *batch++ = upper_32_bits(addr); 1333 *batch++ = upper_32_bits(target_offset); 1334 } else { 1335 *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1336 *batch++ = lower_32_bits(addr); 1337 *batch++ = upper_32_bits(addr); 1338 *batch++ = lower_32_bits(target_offset); 1339 *batch++ = upper_32_bits(target_offset); 1340 } 1341 } else if (gen >= 6) { 1342 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1343 *batch++ = 0; 1344 *batch++ = addr; 1345 *batch++ = target_offset; 1346 } else if (gen >= 4) { 1347 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1348 *batch++ = 0; 1349 *batch++ = addr; 1350 *batch++ = target_offset; 1351 } else { 1352 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1353 *batch++ = addr; 1354 *batch++ = target_offset; 1355 } 1356 1357 goto out; 1358 } 1359 1360 repeat: 1361 vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT); 1362 if (IS_ERR(vaddr)) 1363 return PTR_ERR(vaddr); 1364 1365 clflush_write32(vaddr + offset_in_page(offset), 1366 lower_32_bits(target_offset), 1367 eb->reloc_cache.vaddr); 1368 1369 if (wide) { 1370 offset += sizeof(u32); 1371 target_offset >>= 32; 1372 wide = false; 1373 goto repeat; 1374 } 1375 1376 out: 1377 return target->node.start | UPDATE; 1378 } 1379 1380 static u64 1381 eb_relocate_entry(struct i915_execbuffer *eb, 1382 struct i915_vma *vma, 1383 const struct drm_i915_gem_relocation_entry *reloc) 1384 { 1385 struct i915_vma *target; 1386 int err; 1387 1388 /* we've already hold a reference to all valid objects */ 1389 target = eb_get_vma(eb, reloc->target_handle); 1390 if (unlikely(!target)) 1391 return -ENOENT; 1392 1393 /* Validate that the target is in a valid r/w GPU domain */ 1394 if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) { 1395 DRM_DEBUG("reloc with multiple write domains: " 1396 "target %d offset %d " 1397 "read %08x write %08x", 1398 reloc->target_handle, 1399 (int) reloc->offset, 
1400 reloc->read_domains, 1401 reloc->write_domain); 1402 return -EINVAL; 1403 } 1404 if (unlikely((reloc->write_domain | reloc->read_domains) 1405 & ~I915_GEM_GPU_DOMAINS)) { 1406 DRM_DEBUG("reloc with read/write non-GPU domains: " 1407 "target %d offset %d " 1408 "read %08x write %08x", 1409 reloc->target_handle, 1410 (int) reloc->offset, 1411 reloc->read_domains, 1412 reloc->write_domain); 1413 return -EINVAL; 1414 } 1415 1416 if (reloc->write_domain) { 1417 *target->exec_flags |= EXEC_OBJECT_WRITE; 1418 1419 /* 1420 * Sandybridge PPGTT errata: We need a global gtt mapping 1421 * for MI and pipe_control writes because the gpu doesn't 1422 * properly redirect them through the ppgtt for non_secure 1423 * batchbuffers. 1424 */ 1425 if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION && 1426 IS_GEN(eb->i915, 6)) { 1427 err = i915_vma_bind(target, target->obj->cache_level, 1428 PIN_GLOBAL, NULL); 1429 if (WARN_ONCE(err, 1430 "Unexpected failure to bind target VMA!")) 1431 return err; 1432 } 1433 } 1434 1435 /* 1436 * If the relocation already has the right value in it, no 1437 * more work needs to be done. 1438 */ 1439 if (!DBG_FORCE_RELOC && 1440 gen8_canonical_addr(target->node.start) == reloc->presumed_offset) 1441 return 0; 1442 1443 /* Check that the relocation address is valid... */ 1444 if (unlikely(reloc->offset > 1445 vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) { 1446 DRM_DEBUG("Relocation beyond object bounds: " 1447 "target %d offset %d size %d.\n", 1448 reloc->target_handle, 1449 (int)reloc->offset, 1450 (int)vma->size); 1451 return -EINVAL; 1452 } 1453 if (unlikely(reloc->offset & 3)) { 1454 DRM_DEBUG("Relocation not 4-byte aligned: " 1455 "target %d offset %d.\n", 1456 reloc->target_handle, 1457 (int)reloc->offset); 1458 return -EINVAL; 1459 } 1460 1461 /* 1462 * If we write into the object, we need to force the synchronisation 1463 * barrier, either with an asynchronous clflush or if we executed the 1464 * patching using the GPU (though that should be serialised by the 1465 * timeline). To be completely sure, and since we are required to 1466 * do relocations we are already stalling, disable the user's opt 1467 * out of our synchronisation. 1468 */ 1469 *vma->exec_flags &= ~EXEC_OBJECT_ASYNC; 1470 1471 /* and update the user's relocation entry */ 1472 return relocate_entry(vma, reloc, eb, target); 1473 } 1474 1475 static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma) 1476 { 1477 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry)) 1478 struct drm_i915_gem_relocation_entry stack[N_RELOC(512)]; 1479 struct drm_i915_gem_relocation_entry __user *urelocs; 1480 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1481 unsigned int remain; 1482 1483 urelocs = u64_to_user_ptr(entry->relocs_ptr); 1484 remain = entry->relocation_count; 1485 if (unlikely(remain > N_RELOC(ULONG_MAX))) 1486 return -EINVAL; 1487 1488 /* 1489 * We must check that the entire relocation array is safe 1490 * to read. However, if the array is not writable the user loses 1491 * the updated relocation values. 
1492 */ 1493 if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs)))) 1494 return -EFAULT; 1495 1496 do { 1497 struct drm_i915_gem_relocation_entry *r = stack; 1498 unsigned int count = 1499 min_t(unsigned int, remain, ARRAY_SIZE(stack)); 1500 unsigned int copied; 1501 1502 /* 1503 * This is the fast path and we cannot handle a pagefault 1504 * whilst holding the struct mutex lest the user pass in the 1505 * relocations contained within a mmaped bo. For in such a case 1506 * we, the page fault handler would call i915_gem_fault() and 1507 * we would try to acquire the struct mutex again. Obviously 1508 * this is bad and so lockdep complains vehemently. 1509 */ 1510 pagefault_disable(); 1511 copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); 1512 pagefault_enable(); 1513 if (unlikely(copied)) { 1514 remain = -EFAULT; 1515 goto out; 1516 } 1517 1518 remain -= count; 1519 do { 1520 u64 offset = eb_relocate_entry(eb, vma, r); 1521 1522 if (likely(offset == 0)) { 1523 } else if ((s64)offset < 0) { 1524 remain = (int)offset; 1525 goto out; 1526 } else { 1527 /* 1528 * Note that reporting an error now 1529 * leaves everything in an inconsistent 1530 * state as we have *already* changed 1531 * the relocation value inside the 1532 * object. As we have not changed the 1533 * reloc.presumed_offset or will not 1534 * change the execobject.offset, on the 1535 * call we may not rewrite the value 1536 * inside the object, leaving it 1537 * dangling and causing a GPU hang. Unless 1538 * userspace dynamically rebuilds the 1539 * relocations on each execbuf rather than 1540 * presume a static tree. 1541 * 1542 * We did previously check if the relocations 1543 * were writable (access_ok), an error now 1544 * would be a strange race with mprotect, 1545 * having already demonstrated that we 1546 * can read from this userspace address. 
1547 */ 1548 offset = gen8_canonical_addr(offset & ~UPDATE); 1549 if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) { 1550 remain = -EFAULT; 1551 goto out; 1552 } 1553 } 1554 } while (r++, --count); 1555 urelocs += ARRAY_SIZE(stack); 1556 } while (remain); 1557 out: 1558 reloc_cache_reset(&eb->reloc_cache); 1559 return remain; 1560 } 1561 1562 static int 1563 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma) 1564 { 1565 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1566 struct drm_i915_gem_relocation_entry *relocs = 1567 u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1568 unsigned int i; 1569 int err; 1570 1571 for (i = 0; i < entry->relocation_count; i++) { 1572 u64 offset = eb_relocate_entry(eb, vma, &relocs[i]); 1573 1574 if ((s64)offset < 0) { 1575 err = (int)offset; 1576 goto err; 1577 } 1578 } 1579 err = 0; 1580 err: 1581 reloc_cache_reset(&eb->reloc_cache); 1582 return err; 1583 } 1584 1585 static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) 1586 { 1587 const char __user *addr, *end; 1588 unsigned long size; 1589 char __maybe_unused c; 1590 1591 size = entry->relocation_count; 1592 if (size == 0) 1593 return 0; 1594 1595 if (size > N_RELOC(ULONG_MAX)) 1596 return -EINVAL; 1597 1598 addr = u64_to_user_ptr(entry->relocs_ptr); 1599 size *= sizeof(struct drm_i915_gem_relocation_entry); 1600 if (!access_ok(addr, size)) 1601 return -EFAULT; 1602 1603 end = addr + size; 1604 for (; addr < end; addr += PAGE_SIZE) { 1605 int err = __get_user(c, addr); 1606 if (err) 1607 return err; 1608 } 1609 return __get_user(c, end - 1); 1610 } 1611 1612 static int eb_copy_relocations(const struct i915_execbuffer *eb) 1613 { 1614 struct drm_i915_gem_relocation_entry *relocs; 1615 const unsigned int count = eb->buffer_count; 1616 unsigned int i; 1617 int err; 1618 1619 for (i = 0; i < count; i++) { 1620 const unsigned int nreloc = eb->exec[i].relocation_count; 1621 struct drm_i915_gem_relocation_entry __user *urelocs; 1622 unsigned long size; 1623 unsigned long copied; 1624 1625 if (nreloc == 0) 1626 continue; 1627 1628 err = check_relocations(&eb->exec[i]); 1629 if (err) 1630 goto err; 1631 1632 urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); 1633 size = nreloc * sizeof(*relocs); 1634 1635 relocs = kvmalloc_array(size, 1, GFP_KERNEL); 1636 if (!relocs) { 1637 err = -ENOMEM; 1638 goto err; 1639 } 1640 1641 /* copy_from_user is limited to < 4GiB */ 1642 copied = 0; 1643 do { 1644 unsigned int len = 1645 min_t(u64, BIT_ULL(31), size - copied); 1646 1647 if (__copy_from_user((char *)relocs + copied, 1648 (char __user *)urelocs + copied, 1649 len)) 1650 goto end; 1651 1652 copied += len; 1653 } while (copied < size); 1654 1655 /* 1656 * As we do not update the known relocation offsets after 1657 * relocating (due to the complexities in lock handling), 1658 * we need to mark them as invalid now so that we force the 1659 * relocation processing next time. Just in case the target 1660 * object is evicted and then rebound into its old 1661 * presumed_offset before the next execbuffer - if that 1662 * happened we would make the mistake of assuming that the 1663 * relocations were valid. 
1664 */ 1665 if (!user_access_begin(urelocs, size)) 1666 goto end; 1667 1668 for (copied = 0; copied < nreloc; copied++) 1669 unsafe_put_user(-1, 1670 &urelocs[copied].presumed_offset, 1671 end_user); 1672 user_access_end(); 1673 1674 eb->exec[i].relocs_ptr = (uintptr_t)relocs; 1675 } 1676 1677 return 0; 1678 1679 end_user: 1680 user_access_end(); 1681 end: 1682 kvfree(relocs); 1683 err = -EFAULT; 1684 err: 1685 while (i--) { 1686 relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); 1687 if (eb->exec[i].relocation_count) 1688 kvfree(relocs); 1689 } 1690 return err; 1691 } 1692 1693 static int eb_prefault_relocations(const struct i915_execbuffer *eb) 1694 { 1695 const unsigned int count = eb->buffer_count; 1696 unsigned int i; 1697 1698 if (unlikely(i915_modparams.prefault_disable)) 1699 return 0; 1700 1701 for (i = 0; i < count; i++) { 1702 int err; 1703 1704 err = check_relocations(&eb->exec[i]); 1705 if (err) 1706 return err; 1707 } 1708 1709 return 0; 1710 } 1711 1712 static noinline int eb_relocate_slow(struct i915_execbuffer *eb) 1713 { 1714 struct drm_device *dev = &eb->i915->drm; 1715 bool have_copy = false; 1716 struct i915_vma *vma; 1717 int err = 0; 1718 1719 repeat: 1720 if (signal_pending(current)) { 1721 err = -ERESTARTSYS; 1722 goto out; 1723 } 1724 1725 /* We may process another execbuffer during the unlock... */ 1726 eb_reset_vmas(eb); 1727 mutex_unlock(&dev->struct_mutex); 1728 1729 /* 1730 * We take 3 passes through the slowpatch. 1731 * 1732 * 1 - we try to just prefault all the user relocation entries and 1733 * then attempt to reuse the atomic pagefault disabled fast path again. 1734 * 1735 * 2 - we copy the user entries to a local buffer here outside of the 1736 * local and allow ourselves to wait upon any rendering before 1737 * relocations 1738 * 1739 * 3 - we already have a local copy of the relocation entries, but 1740 * were interrupted (EAGAIN) whilst waiting for the objects, try again. 1741 */ 1742 if (!err) { 1743 err = eb_prefault_relocations(eb); 1744 } else if (!have_copy) { 1745 err = eb_copy_relocations(eb); 1746 have_copy = err == 0; 1747 } else { 1748 cond_resched(); 1749 err = 0; 1750 } 1751 if (err) { 1752 mutex_lock(&dev->struct_mutex); 1753 goto out; 1754 } 1755 1756 /* A frequent cause for EAGAIN are currently unavailable client pages */ 1757 flush_workqueue(eb->i915->mm.userptr_wq); 1758 1759 err = i915_mutex_lock_interruptible(dev); 1760 if (err) { 1761 mutex_lock(&dev->struct_mutex); 1762 goto out; 1763 } 1764 1765 /* reacquire the objects */ 1766 err = eb_lookup_vmas(eb); 1767 if (err) 1768 goto err; 1769 1770 GEM_BUG_ON(!eb->batch); 1771 1772 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1773 if (!have_copy) { 1774 pagefault_disable(); 1775 err = eb_relocate_vma(eb, vma); 1776 pagefault_enable(); 1777 if (err) 1778 goto repeat; 1779 } else { 1780 err = eb_relocate_vma_slow(eb, vma); 1781 if (err) 1782 goto err; 1783 } 1784 } 1785 1786 /* 1787 * Leave the user relocations as are, this is the painfully slow path, 1788 * and we want to avoid the complication of dropping the lock whilst 1789 * having buffers reserved in the aperture and so causing spurious 1790 * ENOSPC for random operations. 
1791 */ 1792 1793 err: 1794 if (err == -EAGAIN) 1795 goto repeat; 1796 1797 out: 1798 if (have_copy) { 1799 const unsigned int count = eb->buffer_count; 1800 unsigned int i; 1801 1802 for (i = 0; i < count; i++) { 1803 const struct drm_i915_gem_exec_object2 *entry = 1804 &eb->exec[i]; 1805 struct drm_i915_gem_relocation_entry *relocs; 1806 1807 if (!entry->relocation_count) 1808 continue; 1809 1810 relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1811 kvfree(relocs); 1812 } 1813 } 1814 1815 return err; 1816 } 1817 1818 static int eb_relocate(struct i915_execbuffer *eb) 1819 { 1820 if (eb_lookup_vmas(eb)) 1821 goto slow; 1822 1823 /* The objects are in their final locations, apply the relocations. */ 1824 if (eb->args->flags & __EXEC_HAS_RELOC) { 1825 struct i915_vma *vma; 1826 1827 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1828 if (eb_relocate_vma(eb, vma)) 1829 goto slow; 1830 } 1831 } 1832 1833 return 0; 1834 1835 slow: 1836 return eb_relocate_slow(eb); 1837 } 1838 1839 static int eb_move_to_gpu(struct i915_execbuffer *eb) 1840 { 1841 const unsigned int count = eb->buffer_count; 1842 struct ww_acquire_ctx acquire; 1843 unsigned int i; 1844 int err = 0; 1845 1846 ww_acquire_init(&acquire, &reservation_ww_class); 1847 1848 for (i = 0; i < count; i++) { 1849 struct i915_vma *vma = eb->vma[i]; 1850 1851 err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); 1852 if (!err) 1853 continue; 1854 1855 GEM_BUG_ON(err == -EALREADY); /* No duplicate vma */ 1856 1857 if (err == -EDEADLK) { 1858 GEM_BUG_ON(i == 0); 1859 do { 1860 int j = i - 1; 1861 1862 ww_mutex_unlock(&eb->vma[j]->resv->lock); 1863 1864 swap(eb->flags[i], eb->flags[j]); 1865 swap(eb->vma[i], eb->vma[j]); 1866 eb->vma[i]->exec_flags = &eb->flags[i]; 1867 } while (--i); 1868 GEM_BUG_ON(vma != eb->vma[0]); 1869 vma->exec_flags = &eb->flags[0]; 1870 1871 err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, 1872 &acquire); 1873 } 1874 if (err) 1875 break; 1876 } 1877 ww_acquire_done(&acquire); 1878 1879 while (i--) { 1880 unsigned int flags = eb->flags[i]; 1881 struct i915_vma *vma = eb->vma[i]; 1882 struct drm_i915_gem_object *obj = vma->obj; 1883 1884 assert_vma_held(vma); 1885 1886 if (flags & EXEC_OBJECT_CAPTURE) { 1887 struct i915_capture_list *capture; 1888 1889 capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1890 if (capture) { 1891 capture->next = eb->request->capture_list; 1892 capture->vma = vma; 1893 eb->request->capture_list = capture; 1894 } 1895 } 1896 1897 /* 1898 * If the GPU is not _reading_ through the CPU cache, we need 1899 * to make sure that any writes (both previous GPU writes from 1900 * before a change in snooping levels and normal CPU writes) 1901 * caught in that cache are flushed to main memory. 1902 * 1903 * We want to say 1904 * obj->cache_dirty && 1905 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 1906 * but gcc's optimiser doesn't handle that as well and emits 1907 * two jumps instead of one. Maybe one day... 
1908 */ 1909 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 1910 if (i915_gem_clflush_object(obj, 0)) 1911 flags &= ~EXEC_OBJECT_ASYNC; 1912 } 1913 1914 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 1915 err = i915_request_await_object 1916 (eb->request, obj, flags & EXEC_OBJECT_WRITE); 1917 } 1918 1919 if (err == 0) 1920 err = i915_vma_move_to_active(vma, eb->request, flags); 1921 1922 i915_vma_unlock(vma); 1923 1924 __eb_unreserve_vma(vma, flags); 1925 vma->exec_flags = NULL; 1926 1927 if (unlikely(flags & __EXEC_OBJECT_HAS_REF)) 1928 i915_vma_put(vma); 1929 } 1930 ww_acquire_fini(&acquire); 1931 1932 if (unlikely(err)) 1933 goto err_skip; 1934 1935 eb->exec = NULL; 1936 1937 /* Unconditionally flush any chipset caches (for streaming writes). */ 1938 intel_gt_chipset_flush(eb->engine->gt); 1939 return 0; 1940 1941 err_skip: 1942 i915_request_skip(eb->request, err); 1943 return err; 1944 } 1945 1946 static bool i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) 1947 { 1948 if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) 1949 return false; 1950 1951 /* Kernel clipping was a DRI1 misfeature */ 1952 if (!(exec->flags & I915_EXEC_FENCE_ARRAY)) { 1953 if (exec->num_cliprects || exec->cliprects_ptr) 1954 return false; 1955 } 1956 1957 if (exec->DR4 == 0xffffffff) { 1958 DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); 1959 exec->DR4 = 0; 1960 } 1961 if (exec->DR1 || exec->DR4) 1962 return false; 1963 1964 if ((exec->batch_start_offset | exec->batch_len) & 0x7) 1965 return false; 1966 1967 return true; 1968 } 1969 1970 static int i915_reset_gen7_sol_offsets(struct i915_request *rq) 1971 { 1972 u32 *cs; 1973 int i; 1974 1975 if (!IS_GEN(rq->i915, 7) || rq->engine->id != RCS0) { 1976 DRM_DEBUG("sol reset is gen7/rcs only\n"); 1977 return -EINVAL; 1978 } 1979 1980 cs = intel_ring_begin(rq, 4 * 2 + 2); 1981 if (IS_ERR(cs)) 1982 return PTR_ERR(cs); 1983 1984 *cs++ = MI_LOAD_REGISTER_IMM(4); 1985 for (i = 0; i < 4; i++) { 1986 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i)); 1987 *cs++ = 0; 1988 } 1989 *cs++ = MI_NOOP; 1990 intel_ring_advance(rq, cs); 1991 1992 return 0; 1993 } 1994 1995 static struct i915_vma * 1996 shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj) 1997 { 1998 struct drm_i915_private *dev_priv = eb->i915; 1999 struct i915_vma * const vma = *eb->vma; 2000 struct i915_address_space *vm; 2001 u64 flags; 2002 2003 /* 2004 * PPGTT backed shadow buffers must be mapped RO, to prevent 2005 * post-scan tampering 2006 */ 2007 if (CMDPARSER_USES_GGTT(dev_priv)) { 2008 flags = PIN_GLOBAL; 2009 vm = &dev_priv->ggtt.vm; 2010 } else if (vma->vm->has_read_only) { 2011 flags = PIN_USER; 2012 vm = vma->vm; 2013 i915_gem_object_set_readonly(obj); 2014 } else { 2015 DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); 2016 return ERR_PTR(-EINVAL); 2017 } 2018 2019 return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags); 2020 } 2021 2022 static struct i915_vma *eb_parse(struct i915_execbuffer *eb) 2023 { 2024 struct intel_engine_pool_node *pool; 2025 struct i915_vma *vma; 2026 u64 batch_start; 2027 u64 shadow_batch_start; 2028 int err; 2029 2030 pool = intel_engine_get_pool(eb->engine, eb->batch_len); 2031 if (IS_ERR(pool)) 2032 return ERR_CAST(pool); 2033 2034 vma = shadow_batch_pin(eb, pool->obj); 2035 if (IS_ERR(vma)) 2036 goto err; 2037 2038 batch_start = gen8_canonical_addr(eb->batch->node.start) + 2039 eb->batch_start_offset; 2040 2041 shadow_batch_start = gen8_canonical_addr(vma->node.start); 2042 2043 err = 
intel_engine_cmd_parser(eb->gem_context, 2044 eb->engine, 2045 eb->batch->obj, 2046 batch_start, 2047 eb->batch_start_offset, 2048 eb->batch_len, 2049 pool->obj, 2050 shadow_batch_start); 2051 2052 if (err) { 2053 i915_vma_unpin(vma); 2054 2055 /* 2056 * Unsafe GGTT-backed buffers can still be submitted safely 2057 * as non-secure. 2058 * For PPGTT backing however, we have no choice but to forcibly 2059 * reject unsafe buffers 2060 */ 2061 if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES)) 2062 /* Execute original buffer non-secure */ 2063 vma = NULL; 2064 else 2065 vma = ERR_PTR(err); 2066 goto err; 2067 } 2068 2069 eb->vma[eb->buffer_count] = i915_vma_get(vma); 2070 eb->flags[eb->buffer_count] = 2071 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; 2072 vma->exec_flags = &eb->flags[eb->buffer_count]; 2073 eb->buffer_count++; 2074 2075 eb->batch_start_offset = 0; 2076 eb->batch = vma; 2077 2078 if (CMDPARSER_USES_GGTT(eb->i915)) 2079 eb->batch_flags |= I915_DISPATCH_SECURE; 2080 2081 /* eb->batch_len unchanged */ 2082 2083 vma->private = pool; 2084 return vma; 2085 2086 err: 2087 intel_engine_pool_put(pool); 2088 return vma; 2089 } 2090 2091 static void 2092 add_to_client(struct i915_request *rq, struct drm_file *file) 2093 { 2094 struct drm_i915_file_private *file_priv = file->driver_priv; 2095 2096 rq->file_priv = file_priv; 2097 2098 spin_lock(&file_priv->mm.lock); 2099 list_add_tail(&rq->client_link, &file_priv->mm.request_list); 2100 spin_unlock(&file_priv->mm.lock); 2101 } 2102 2103 static int eb_submit(struct i915_execbuffer *eb) 2104 { 2105 int err; 2106 2107 err = eb_move_to_gpu(eb); 2108 if (err) 2109 return err; 2110 2111 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2112 err = i915_reset_gen7_sol_offsets(eb->request); 2113 if (err) 2114 return err; 2115 } 2116 2117 /* 2118 * After we completed waiting for other engines (using HW semaphores) 2119 * then we can signal that this request/batch is ready to run. This 2120 * allows us to determine if the batch is still waiting on the GPU 2121 * or actually running by checking the breadcrumb. 2122 */ 2123 if (eb->engine->emit_init_breadcrumb) { 2124 err = eb->engine->emit_init_breadcrumb(eb->request); 2125 if (err) 2126 return err; 2127 } 2128 2129 err = eb->engine->emit_bb_start(eb->request, 2130 eb->batch->node.start + 2131 eb->batch_start_offset, 2132 eb->batch_len, 2133 eb->batch_flags); 2134 if (err) 2135 return err; 2136 2137 if (i915_gem_context_nopreempt(eb->gem_context)) 2138 eb->request->flags |= I915_REQUEST_NOPREEMPT; 2139 2140 return 0; 2141 } 2142 2143 static int num_vcs_engines(const struct drm_i915_private *i915) 2144 { 2145 return hweight64(INTEL_INFO(i915)->engine_mask & 2146 GENMASK_ULL(VCS0 + I915_MAX_VCS - 1, VCS0)); 2147 } 2148 2149 /* 2150 * Find one BSD ring to dispatch the corresponding BSD command. 2151 * The engine index is returned. 2152 */ 2153 static unsigned int 2154 gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, 2155 struct drm_file *file) 2156 { 2157 struct drm_i915_file_private *file_priv = file->driver_priv; 2158 2159 /* Check whether the file_priv has already selected one ring. 
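 *
 * Userspace can also pick a BSD ring explicitly instead of taking the
 * sticky random choice made here; a hypothetical sketch using only the
 * uapi flags:
 *
 *	execbuf.flags = I915_EXEC_BSD | I915_EXEC_BSD_RING2;
 *
 * eb_select_legacy_ring() below decodes that to VCS1, while plain
 * I915_EXEC_BSD (i.e. I915_EXEC_BSD_DEFAULT) ends up here and the
 * randomly selected engine then sticks to this file for its lifetime.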
*/ 2160 if ((int)file_priv->bsd_engine < 0) 2161 file_priv->bsd_engine = 2162 get_random_int() % num_vcs_engines(dev_priv); 2163 2164 return file_priv->bsd_engine; 2165 } 2166 2167 static const enum intel_engine_id user_ring_map[] = { 2168 [I915_EXEC_DEFAULT] = RCS0, 2169 [I915_EXEC_RENDER] = RCS0, 2170 [I915_EXEC_BLT] = BCS0, 2171 [I915_EXEC_BSD] = VCS0, 2172 [I915_EXEC_VEBOX] = VECS0 2173 }; 2174 2175 static struct i915_request *eb_throttle(struct intel_context *ce) 2176 { 2177 struct intel_ring *ring = ce->ring; 2178 struct intel_timeline *tl = ce->timeline; 2179 struct i915_request *rq; 2180 2181 /* 2182 * Completely unscientific finger-in-the-air estimates for suitable 2183 * maximum user request size (to avoid blocking) and then backoff. 2184 */ 2185 if (intel_ring_update_space(ring) >= PAGE_SIZE) 2186 return NULL; 2187 2188 /* 2189 * Find a request that after waiting upon, there will be at least half 2190 * the ring available. The hysteresis allows us to compete for the 2191 * shared ring and should mean that we sleep less often prior to 2192 * claiming our resources, but not so long that the ring completely 2193 * drains before we can submit our next request. 2194 */ 2195 list_for_each_entry(rq, &tl->requests, link) { 2196 if (rq->ring != ring) 2197 continue; 2198 2199 if (__intel_ring_space(rq->postfix, 2200 ring->emit, ring->size) > ring->size / 2) 2201 break; 2202 } 2203 if (&rq->link == &tl->requests) 2204 return NULL; /* weird, we will check again later for real */ 2205 2206 return i915_request_get(rq); 2207 } 2208 2209 static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) 2210 { 2211 struct intel_timeline *tl; 2212 struct i915_request *rq; 2213 int err; 2214 2215 /* 2216 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report 2217 * EIO if the GPU is already wedged. 2218 */ 2219 err = intel_gt_terminally_wedged(ce->engine->gt); 2220 if (err) 2221 return err; 2222 2223 /* 2224 * Pinning the contexts may generate requests in order to acquire 2225 * GGTT space, so do this first before we reserve a seqno for 2226 * ourselves. 2227 */ 2228 err = intel_context_pin(ce); 2229 if (err) 2230 return err; 2231 2232 /* 2233 * Take a local wakeref for preparing to dispatch the execbuf as 2234 * we expect to access the hardware fairly frequently in the 2235 * process, and require the engine to be kept awake between accesses. 2236 * Upon dispatch, we acquire another prolonged wakeref that we hold 2237 * until the timeline is idle, which in turn releases the wakeref 2238 * taken on the engine, and the parent device. 
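 *
 * Note on eb_throttle() above: ring "space" follows the usual circular
 * buffer arithmetic. Purely as an illustration (the driver has its own
 * helpers; this is not their exact definition):
 *
 *	static unsigned int ring_space(unsigned int head,
 *				       unsigned int tail,
 *				       unsigned int size)
 *	{
 *		return (head - tail - 1) & (size - 1);
 *	}
 *
 * eb_throttle() walks the timeline looking for the first request whose
 * completion leaves more than size/2 bytes free, so that a waiter
 * reclaims a large chunk of the ring at once rather than being woken
 * for every retired request.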
2239 */ 2240 tl = intel_context_timeline_lock(ce); 2241 if (IS_ERR(tl)) { 2242 err = PTR_ERR(tl); 2243 goto err_unpin; 2244 } 2245 2246 intel_context_enter(ce); 2247 rq = eb_throttle(ce); 2248 2249 intel_context_timeline_unlock(tl); 2250 2251 if (rq) { 2252 if (i915_request_wait(rq, 2253 I915_WAIT_INTERRUPTIBLE, 2254 MAX_SCHEDULE_TIMEOUT) < 0) { 2255 i915_request_put(rq); 2256 err = -EINTR; 2257 goto err_exit; 2258 } 2259 2260 i915_request_put(rq); 2261 } 2262 2263 eb->engine = ce->engine; 2264 eb->context = ce; 2265 return 0; 2266 2267 err_exit: 2268 mutex_lock(&tl->mutex); 2269 intel_context_exit(ce); 2270 intel_context_timeline_unlock(tl); 2271 err_unpin: 2272 intel_context_unpin(ce); 2273 return err; 2274 } 2275 2276 static void eb_unpin_engine(struct i915_execbuffer *eb) 2277 { 2278 struct intel_context *ce = eb->context; 2279 struct intel_timeline *tl = ce->timeline; 2280 2281 mutex_lock(&tl->mutex); 2282 intel_context_exit(ce); 2283 mutex_unlock(&tl->mutex); 2284 2285 intel_context_unpin(ce); 2286 } 2287 2288 static unsigned int 2289 eb_select_legacy_ring(struct i915_execbuffer *eb, 2290 struct drm_file *file, 2291 struct drm_i915_gem_execbuffer2 *args) 2292 { 2293 struct drm_i915_private *i915 = eb->i915; 2294 unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; 2295 2296 if (user_ring_id != I915_EXEC_BSD && 2297 (args->flags & I915_EXEC_BSD_MASK)) { 2298 DRM_DEBUG("execbuf with non bsd ring but with invalid " 2299 "bsd dispatch flags: %d\n", (int)(args->flags)); 2300 return -1; 2301 } 2302 2303 if (user_ring_id == I915_EXEC_BSD && num_vcs_engines(i915) > 1) { 2304 unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; 2305 2306 if (bsd_idx == I915_EXEC_BSD_DEFAULT) { 2307 bsd_idx = gen8_dispatch_bsd_engine(i915, file); 2308 } else if (bsd_idx >= I915_EXEC_BSD_RING1 && 2309 bsd_idx <= I915_EXEC_BSD_RING2) { 2310 bsd_idx >>= I915_EXEC_BSD_SHIFT; 2311 bsd_idx--; 2312 } else { 2313 DRM_DEBUG("execbuf with unknown bsd ring: %u\n", 2314 bsd_idx); 2315 return -1; 2316 } 2317 2318 return _VCS(bsd_idx); 2319 } 2320 2321 if (user_ring_id >= ARRAY_SIZE(user_ring_map)) { 2322 DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id); 2323 return -1; 2324 } 2325 2326 return user_ring_map[user_ring_id]; 2327 } 2328 2329 static int 2330 eb_pin_engine(struct i915_execbuffer *eb, 2331 struct drm_file *file, 2332 struct drm_i915_gem_execbuffer2 *args) 2333 { 2334 struct intel_context *ce; 2335 unsigned int idx; 2336 int err; 2337 2338 if (i915_gem_context_user_engines(eb->gem_context)) 2339 idx = args->flags & I915_EXEC_RING_MASK; 2340 else 2341 idx = eb_select_legacy_ring(eb, file, args); 2342 2343 ce = i915_gem_context_get_engine(eb->gem_context, idx); 2344 if (IS_ERR(ce)) 2345 return PTR_ERR(ce); 2346 2347 err = __eb_pin_engine(eb, ce); 2348 intel_context_put(ce); 2349 2350 return err; 2351 } 2352 2353 static void 2354 __free_fence_array(struct drm_syncobj **fences, unsigned int n) 2355 { 2356 while (n--) 2357 drm_syncobj_put(ptr_mask_bits(fences[n], 2)); 2358 kvfree(fences); 2359 } 2360 2361 static struct drm_syncobj ** 2362 get_fence_array(struct drm_i915_gem_execbuffer2 *args, 2363 struct drm_file *file) 2364 { 2365 const unsigned long nfences = args->num_cliprects; 2366 struct drm_i915_gem_exec_fence __user *user; 2367 struct drm_syncobj **fences; 2368 unsigned long n; 2369 int err; 2370 2371 if (!(args->flags & I915_EXEC_FENCE_ARRAY)) 2372 return NULL; 2373 2374 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 2375 BUILD_BUG_ON(sizeof(size_t) > 
sizeof(unsigned long)); 2376 if (nfences > min_t(unsigned long, 2377 ULONG_MAX / sizeof(*user), 2378 SIZE_MAX / sizeof(*fences))) 2379 return ERR_PTR(-EINVAL); 2380 2381 user = u64_to_user_ptr(args->cliprects_ptr); 2382 if (!access_ok(user, nfences * sizeof(*user))) 2383 return ERR_PTR(-EFAULT); 2384 2385 fences = kvmalloc_array(nfences, sizeof(*fences), 2386 __GFP_NOWARN | GFP_KERNEL); 2387 if (!fences) 2388 return ERR_PTR(-ENOMEM); 2389 2390 for (n = 0; n < nfences; n++) { 2391 struct drm_i915_gem_exec_fence fence; 2392 struct drm_syncobj *syncobj; 2393 2394 if (__copy_from_user(&fence, user++, sizeof(fence))) { 2395 err = -EFAULT; 2396 goto err; 2397 } 2398 2399 if (fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) { 2400 err = -EINVAL; 2401 goto err; 2402 } 2403 2404 syncobj = drm_syncobj_find(file, fence.handle); 2405 if (!syncobj) { 2406 DRM_DEBUG("Invalid syncobj handle provided\n"); 2407 err = -ENOENT; 2408 goto err; 2409 } 2410 2411 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 2412 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 2413 2414 fences[n] = ptr_pack_bits(syncobj, fence.flags, 2); 2415 } 2416 2417 return fences; 2418 2419 err: 2420 __free_fence_array(fences, n); 2421 return ERR_PTR(err); 2422 } 2423 2424 static void 2425 put_fence_array(struct drm_i915_gem_execbuffer2 *args, 2426 struct drm_syncobj **fences) 2427 { 2428 if (fences) 2429 __free_fence_array(fences, args->num_cliprects); 2430 } 2431 2432 static int 2433 await_fence_array(struct i915_execbuffer *eb, 2434 struct drm_syncobj **fences) 2435 { 2436 const unsigned int nfences = eb->args->num_cliprects; 2437 unsigned int n; 2438 int err; 2439 2440 for (n = 0; n < nfences; n++) { 2441 struct drm_syncobj *syncobj; 2442 struct dma_fence *fence; 2443 unsigned int flags; 2444 2445 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2446 if (!(flags & I915_EXEC_FENCE_WAIT)) 2447 continue; 2448 2449 fence = drm_syncobj_fence_get(syncobj); 2450 if (!fence) 2451 return -EINVAL; 2452 2453 err = i915_request_await_dma_fence(eb->request, fence); 2454 dma_fence_put(fence); 2455 if (err < 0) 2456 return err; 2457 } 2458 2459 return 0; 2460 } 2461 2462 static void 2463 signal_fence_array(struct i915_execbuffer *eb, 2464 struct drm_syncobj **fences) 2465 { 2466 const unsigned int nfences = eb->args->num_cliprects; 2467 struct dma_fence * const fence = &eb->request->fence; 2468 unsigned int n; 2469 2470 for (n = 0; n < nfences; n++) { 2471 struct drm_syncobj *syncobj; 2472 unsigned int flags; 2473 2474 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2475 if (!(flags & I915_EXEC_FENCE_SIGNAL)) 2476 continue; 2477 2478 drm_syncobj_replace_fence(syncobj, fence); 2479 } 2480 } 2481 2482 static int 2483 i915_gem_do_execbuffer(struct drm_device *dev, 2484 struct drm_file *file, 2485 struct drm_i915_gem_execbuffer2 *args, 2486 struct drm_i915_gem_exec_object2 *exec, 2487 struct drm_syncobj **fences) 2488 { 2489 struct drm_i915_private *i915 = to_i915(dev); 2490 struct i915_execbuffer eb; 2491 struct dma_fence *in_fence = NULL; 2492 struct dma_fence *exec_fence = NULL; 2493 struct sync_file *out_fence = NULL; 2494 int out_fence_fd = -1; 2495 int err; 2496 2497 BUILD_BUG_ON(__EXEC_INTERNAL_FLAGS & ~__I915_EXEC_ILLEGAL_FLAGS); 2498 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 2499 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 2500 2501 eb.i915 = i915; 2502 eb.file = file; 2503 eb.args = args; 2504 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2505 args->flags |= __EXEC_HAS_RELOC; 2506 2507 eb.exec = exec; 2508 eb.vma = (struct i915_vma **)(exec + 
args->buffer_count + 1); 2509 eb.vma[0] = NULL; 2510 eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1); 2511 2512 eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; 2513 reloc_cache_init(&eb.reloc_cache, eb.i915); 2514 2515 eb.buffer_count = args->buffer_count; 2516 eb.batch_start_offset = args->batch_start_offset; 2517 eb.batch_len = args->batch_len; 2518 2519 eb.batch_flags = 0; 2520 if (args->flags & I915_EXEC_SECURE) { 2521 if (INTEL_GEN(i915) >= 11) 2522 return -ENODEV; 2523 2524 /* Return -EPERM to trigger fallback code on old binaries. */ 2525 if (!HAS_SECURE_BATCHES(i915)) 2526 return -EPERM; 2527 2528 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 2529 return -EPERM; 2530 2531 eb.batch_flags |= I915_DISPATCH_SECURE; 2532 } 2533 if (args->flags & I915_EXEC_IS_PINNED) 2534 eb.batch_flags |= I915_DISPATCH_PINNED; 2535 2536 if (args->flags & I915_EXEC_FENCE_IN) { 2537 in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2538 if (!in_fence) 2539 return -EINVAL; 2540 } 2541 2542 if (args->flags & I915_EXEC_FENCE_SUBMIT) { 2543 if (in_fence) { 2544 err = -EINVAL; 2545 goto err_in_fence; 2546 } 2547 2548 exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2549 if (!exec_fence) { 2550 err = -EINVAL; 2551 goto err_in_fence; 2552 } 2553 } 2554 2555 if (args->flags & I915_EXEC_FENCE_OUT) { 2556 out_fence_fd = get_unused_fd_flags(O_CLOEXEC); 2557 if (out_fence_fd < 0) { 2558 err = out_fence_fd; 2559 goto err_exec_fence; 2560 } 2561 } 2562 2563 err = eb_create(&eb); 2564 if (err) 2565 goto err_out_fence; 2566 2567 GEM_BUG_ON(!eb.lut_size); 2568 2569 err = eb_select_context(&eb); 2570 if (unlikely(err)) 2571 goto err_destroy; 2572 2573 err = eb_pin_engine(&eb, file, args); 2574 if (unlikely(err)) 2575 goto err_context; 2576 2577 err = i915_mutex_lock_interruptible(dev); 2578 if (err) 2579 goto err_engine; 2580 2581 err = eb_relocate(&eb); 2582 if (err) { 2583 /* 2584 * If the user expects the execobject.offset and 2585 * reloc.presumed_offset to be an exact match, 2586 * as for using NO_RELOC, then we cannot update 2587 * the execobject.offset until we have completed 2588 * relocation. 2589 */ 2590 args->flags &= ~__EXEC_HAS_RELOC; 2591 goto err_vma; 2592 } 2593 2594 if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) { 2595 DRM_DEBUG("Attempting to use self-modifying batch buffer\n"); 2596 err = -EINVAL; 2597 goto err_vma; 2598 } 2599 if (eb.batch_start_offset > eb.batch->size || 2600 eb.batch_len > eb.batch->size - eb.batch_start_offset) { 2601 DRM_DEBUG("Attempting to use out-of-bounds batch\n"); 2602 err = -EINVAL; 2603 goto err_vma; 2604 } 2605 2606 if (eb.batch_len == 0) 2607 eb.batch_len = eb.batch->size - eb.batch_start_offset; 2608 2609 if (eb_use_cmdparser(&eb)) { 2610 struct i915_vma *vma; 2611 2612 vma = eb_parse(&eb); 2613 if (IS_ERR(vma)) { 2614 err = PTR_ERR(vma); 2615 goto err_vma; 2616 } 2617 } 2618 2619 /* 2620 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2621 * batch" bit. Hence we need to pin secure batches into the global gtt. 2622 * hsw should have this fixed, but bdw mucks it up again. */ 2623 if (eb.batch_flags & I915_DISPATCH_SECURE) { 2624 struct i915_vma *vma; 2625 2626 /* 2627 * So on first glance it looks freaky that we pin the batch here 2628 * outside of the reservation loop. But: 2629 * - The batch is already pinned into the relevant ppgtt, so we 2630 * already have the backing storage fully allocated. 
2631 * - No other BO uses the global gtt (well contexts, but meh), 2632 * so we don't really have issues with multiple objects not 2633 * fitting due to fragmentation. 2634 * So this is actually safe. 2635 */ 2636 vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0); 2637 if (IS_ERR(vma)) { 2638 err = PTR_ERR(vma); 2639 goto err_vma; 2640 } 2641 2642 eb.batch = vma; 2643 } 2644 2645 /* All GPU relocation batches must be submitted prior to the user rq */ 2646 GEM_BUG_ON(eb.reloc_cache.rq); 2647 2648 /* Allocate a request for this batch buffer nice and early. */ 2649 eb.request = i915_request_create(eb.context); 2650 if (IS_ERR(eb.request)) { 2651 err = PTR_ERR(eb.request); 2652 goto err_batch_unpin; 2653 } 2654 2655 if (in_fence) { 2656 err = i915_request_await_dma_fence(eb.request, in_fence); 2657 if (err < 0) 2658 goto err_request; 2659 } 2660 2661 if (exec_fence) { 2662 err = i915_request_await_execution(eb.request, exec_fence, 2663 eb.engine->bond_execute); 2664 if (err < 0) 2665 goto err_request; 2666 } 2667 2668 if (fences) { 2669 err = await_fence_array(&eb, fences); 2670 if (err) 2671 goto err_request; 2672 } 2673 2674 if (out_fence_fd != -1) { 2675 out_fence = sync_file_create(&eb.request->fence); 2676 if (!out_fence) { 2677 err = -ENOMEM; 2678 goto err_request; 2679 } 2680 } 2681 2682 /* 2683 * Whilst this request exists, batch_obj will be on the 2684 * active_list, and so will hold the active reference. Only when this 2685 * request is retired will the the batch_obj be moved onto the 2686 * inactive_list and lose its active reference. Hence we do not need 2687 * to explicitly hold another reference here. 2688 */ 2689 eb.request->batch = eb.batch; 2690 if (eb.batch->private) 2691 intel_engine_pool_mark_active(eb.batch->private, eb.request); 2692 2693 trace_i915_request_queue(eb.request, eb.batch_flags); 2694 err = eb_submit(&eb); 2695 err_request: 2696 add_to_client(eb.request, file); 2697 i915_request_get(eb.request); 2698 i915_request_add(eb.request); 2699 2700 if (fences) 2701 signal_fence_array(&eb, fences); 2702 2703 if (out_fence) { 2704 if (err == 0) { 2705 fd_install(out_fence_fd, out_fence->file); 2706 args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */ 2707 args->rsvd2 |= (u64)out_fence_fd << 32; 2708 out_fence_fd = -1; 2709 } else { 2710 fput(out_fence->file); 2711 } 2712 } 2713 i915_request_put(eb.request); 2714 2715 err_batch_unpin: 2716 if (eb.batch_flags & I915_DISPATCH_SECURE) 2717 i915_vma_unpin(eb.batch); 2718 if (eb.batch->private) 2719 intel_engine_pool_put(eb.batch->private); 2720 err_vma: 2721 if (eb.exec) 2722 eb_release_vmas(&eb); 2723 mutex_unlock(&dev->struct_mutex); 2724 err_engine: 2725 eb_unpin_engine(&eb); 2726 err_context: 2727 i915_gem_context_put(eb.gem_context); 2728 err_destroy: 2729 eb_destroy(&eb); 2730 err_out_fence: 2731 if (out_fence_fd != -1) 2732 put_unused_fd(out_fence_fd); 2733 err_exec_fence: 2734 dma_fence_put(exec_fence); 2735 err_in_fence: 2736 dma_fence_put(in_fence); 2737 return err; 2738 } 2739 2740 static size_t eb_element_size(void) 2741 { 2742 return (sizeof(struct drm_i915_gem_exec_object2) + 2743 sizeof(struct i915_vma *) + 2744 sizeof(unsigned int)); 2745 } 2746 2747 static bool check_buffer_count(size_t count) 2748 { 2749 const size_t sz = eb_element_size(); 2750 2751 /* 2752 * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup 2753 * array size (see eb_create()). Otherwise, we can accept an array as 2754 * large as can be addressed (though use large arrays at your peril)! 
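 *
 * The "SIZE_MAX / sz - 1" term below guards the later
 * kvmalloc_array(count + 1, eb_element_size(), ...) allocations (the +1
 * being the extra slot reserved for the command parser): we need
 * (count + 1) * sz to fit in size_t, i.e.
 *
 *	count + 1 <= SIZE_MAX / sz
 *	count     <= SIZE_MAX / sz - 1
 *
 * As a rough, illustrative calculation: with 64-bit pointers sz is on
 * the order of 68 bytes, so in practice the INT_MAX lookup-table limit
 * is the one that bites, not the size_t bound.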
2755 */ 2756 2757 return !(count < 1 || count > INT_MAX || count > SIZE_MAX / sz - 1); 2758 } 2759 2760 /* 2761 * Legacy execbuffer just creates an exec2 list from the original exec object 2762 * list array and passes it to the real function. 2763 */ 2764 int 2765 i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data, 2766 struct drm_file *file) 2767 { 2768 struct drm_i915_gem_execbuffer *args = data; 2769 struct drm_i915_gem_execbuffer2 exec2; 2770 struct drm_i915_gem_exec_object *exec_list = NULL; 2771 struct drm_i915_gem_exec_object2 *exec2_list = NULL; 2772 const size_t count = args->buffer_count; 2773 unsigned int i; 2774 int err; 2775 2776 if (!check_buffer_count(count)) { 2777 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2778 return -EINVAL; 2779 } 2780 2781 exec2.buffers_ptr = args->buffers_ptr; 2782 exec2.buffer_count = args->buffer_count; 2783 exec2.batch_start_offset = args->batch_start_offset; 2784 exec2.batch_len = args->batch_len; 2785 exec2.DR1 = args->DR1; 2786 exec2.DR4 = args->DR4; 2787 exec2.num_cliprects = args->num_cliprects; 2788 exec2.cliprects_ptr = args->cliprects_ptr; 2789 exec2.flags = I915_EXEC_RENDER; 2790 i915_execbuffer2_set_context_id(exec2, 0); 2791 2792 if (!i915_gem_check_execbuffer(&exec2)) 2793 return -EINVAL; 2794 2795 /* Copy in the exec list from userland */ 2796 exec_list = kvmalloc_array(count, sizeof(*exec_list), 2797 __GFP_NOWARN | GFP_KERNEL); 2798 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2799 __GFP_NOWARN | GFP_KERNEL); 2800 if (exec_list == NULL || exec2_list == NULL) { 2801 DRM_DEBUG("Failed to allocate exec list for %d buffers\n", 2802 args->buffer_count); 2803 kvfree(exec_list); 2804 kvfree(exec2_list); 2805 return -ENOMEM; 2806 } 2807 err = copy_from_user(exec_list, 2808 u64_to_user_ptr(args->buffers_ptr), 2809 sizeof(*exec_list) * count); 2810 if (err) { 2811 DRM_DEBUG("copy %d exec entries failed %d\n", 2812 args->buffer_count, err); 2813 kvfree(exec_list); 2814 kvfree(exec2_list); 2815 return -EFAULT; 2816 } 2817 2818 for (i = 0; i < args->buffer_count; i++) { 2819 exec2_list[i].handle = exec_list[i].handle; 2820 exec2_list[i].relocation_count = exec_list[i].relocation_count; 2821 exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr; 2822 exec2_list[i].alignment = exec_list[i].alignment; 2823 exec2_list[i].offset = exec_list[i].offset; 2824 if (INTEL_GEN(to_i915(dev)) < 4) 2825 exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE; 2826 else 2827 exec2_list[i].flags = 0; 2828 } 2829 2830 err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list, NULL); 2831 if (exec2.flags & __EXEC_HAS_RELOC) { 2832 struct drm_i915_gem_exec_object __user *user_exec_list = 2833 u64_to_user_ptr(args->buffers_ptr); 2834 2835 /* Copy the new buffer offsets back to the user's exec list. 
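 *
 * gen8_canonical_addr() in the loop below sign-extends bit 47 so the
 * offset reported back to userspace is in canonical 48-bit form. A
 * minimal sketch of that transform, assuming the usual 48-bit virtual
 * address width rather than quoting the helper itself:
 *
 *	static inline u64 to_canonical(u64 addr)
 *	{
 *		return (u64)((s64)(addr << 16) >> 16);
 *	}
 *
 * e.g. to_canonical(0x0000800000000000) == 0xffff800000000000, while
 * addresses with bit 47 clear are returned unchanged.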
*/ 2836 for (i = 0; i < args->buffer_count; i++) { 2837 if (!(exec2_list[i].offset & UPDATE)) 2838 continue; 2839 2840 exec2_list[i].offset = 2841 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2842 exec2_list[i].offset &= PIN_OFFSET_MASK; 2843 if (__copy_to_user(&user_exec_list[i].offset, 2844 &exec2_list[i].offset, 2845 sizeof(user_exec_list[i].offset))) 2846 break; 2847 } 2848 } 2849 2850 kvfree(exec_list); 2851 kvfree(exec2_list); 2852 return err; 2853 } 2854 2855 int 2856 i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, 2857 struct drm_file *file) 2858 { 2859 struct drm_i915_gem_execbuffer2 *args = data; 2860 struct drm_i915_gem_exec_object2 *exec2_list; 2861 struct drm_syncobj **fences = NULL; 2862 const size_t count = args->buffer_count; 2863 int err; 2864 2865 if (!check_buffer_count(count)) { 2866 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2867 return -EINVAL; 2868 } 2869 2870 if (!i915_gem_check_execbuffer(args)) 2871 return -EINVAL; 2872 2873 /* Allocate an extra slot for use by the command parser */ 2874 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2875 __GFP_NOWARN | GFP_KERNEL); 2876 if (exec2_list == NULL) { 2877 DRM_DEBUG("Failed to allocate exec list for %zd buffers\n", 2878 count); 2879 return -ENOMEM; 2880 } 2881 if (copy_from_user(exec2_list, 2882 u64_to_user_ptr(args->buffers_ptr), 2883 sizeof(*exec2_list) * count)) { 2884 DRM_DEBUG("copy %zd exec entries failed\n", count); 2885 kvfree(exec2_list); 2886 return -EFAULT; 2887 } 2888 2889 if (args->flags & I915_EXEC_FENCE_ARRAY) { 2890 fences = get_fence_array(args, file); 2891 if (IS_ERR(fences)) { 2892 kvfree(exec2_list); 2893 return PTR_ERR(fences); 2894 } 2895 } 2896 2897 err = i915_gem_do_execbuffer(dev, file, args, exec2_list, fences); 2898 2899 /* 2900 * Now that we have begun execution of the batchbuffer, we ignore 2901 * any new error after this point. Also given that we have already 2902 * updated the associated relocations, we try to write out the current 2903 * object locations irrespective of any error. 2904 */ 2905 if (args->flags & __EXEC_HAS_RELOC) { 2906 struct drm_i915_gem_exec_object2 __user *user_exec_list = 2907 u64_to_user_ptr(args->buffers_ptr); 2908 unsigned int i; 2909 2910 /* Copy the new buffer offsets back to the user's exec list. */ 2911 /* 2912 * Note: count * sizeof(*user_exec_list) does not overflow, 2913 * because we checked 'count' in check_buffer_count(). 2914 * 2915 * And this range already got effectively checked earlier 2916 * when we did the "copy_from_user()" above. 2917 */ 2918 if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) 2919 goto end; 2920 2921 for (i = 0; i < args->buffer_count; i++) { 2922 if (!(exec2_list[i].offset & UPDATE)) 2923 continue; 2924 2925 exec2_list[i].offset = 2926 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2927 unsafe_put_user(exec2_list[i].offset, 2928 &user_exec_list[i].offset, 2929 end_user); 2930 } 2931 end_user: 2932 user_access_end(); 2933 end:; 2934 } 2935 2936 args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS; 2937 put_fence_array(args, fences); 2938 kvfree(exec2_list); 2939 return err; 2940 } 2941
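
/*
 * For context, a hypothetical userspace sketch of driving this ioctl with
 * NO_RELOC and an output fence. The structure, flag and ioctl names are
 * the uapi ones from <drm/i915_drm.h>; the handles, addresses and sizes
 * are made up:
 *
 *	struct drm_i915_gem_exec_object2 objects[2] = {
 *		{ .handle = target_handle, .offset = presumed_target_addr },
 *		{ .handle = batch_handle,  .offset = presumed_batch_addr },
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr  = (uintptr_t)objects,
 *		.buffer_count = 2,
 *		.batch_len    = batch_bytes,
 *		.flags        = I915_EXEC_RENDER |
 *				I915_EXEC_NO_RELOC |
 *				I915_EXEC_FENCE_OUT,
 *	};
 *	i915_execbuffer2_set_context_id(execbuf, ctx_id);
 *
 *	if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf) == 0) {
 *		int out_fence_fd = execbuf.rsvd2 >> 32;
 *		...
 *	}
 *
 * The final entry in the object list is treated as the batch. With
 * NO_RELOC, each .offset must match what the kernel reported on the
 * previous execbuf (see the copy-back loops above); FENCE_OUT hands back
 * a sync_file fd in the upper half of rsvd2, as written by
 * i915_gem_do_execbuffer() above.
 */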