1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2014-2016 Intel Corporation 4 */ 5 6 #include <linux/pagevec.h> 7 #include <linux/shmem_fs.h> 8 #include <linux/swap.h> 9 #include <linux/uio.h> 10 11 #include <drm/drm_cache.h> 12 13 #include "gem/i915_gem_region.h" 14 #include "i915_drv.h" 15 #include "i915_gem_object.h" 16 #include "i915_gem_tiling.h" 17 #include "i915_gemfs.h" 18 #include "i915_scatterlist.h" 19 #include "i915_trace.h" 20 21 /* 22 * Move folios to appropriate lru and release the batch, decrementing the 23 * ref count of those folios. 24 */ 25 static void check_release_folio_batch(struct folio_batch *fbatch) 26 { 27 check_move_unevictable_folios(fbatch); 28 __folio_batch_release(fbatch); 29 cond_resched(); 30 } 31 32 void shmem_sg_free_table(struct sg_table *st, struct address_space *mapping, 33 bool dirty, bool backup) 34 { 35 struct sgt_iter sgt_iter; 36 struct folio_batch fbatch; 37 struct folio *last = NULL; 38 struct page *page; 39 40 mapping_clear_unevictable(mapping); 41 42 folio_batch_init(&fbatch); 43 for_each_sgt_page(page, sgt_iter, st) { 44 struct folio *folio = page_folio(page); 45 46 if (folio == last) 47 continue; 48 last = folio; 49 if (dirty) 50 folio_mark_dirty(folio); 51 if (backup) 52 folio_mark_accessed(folio); 53 54 if (!folio_batch_add(&fbatch, folio)) 55 check_release_folio_batch(&fbatch); 56 } 57 if (fbatch.nr) 58 check_release_folio_batch(&fbatch); 59 60 sg_free_table(st); 61 } 62 63 int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, 64 size_t size, struct intel_memory_region *mr, 65 struct address_space *mapping, 66 unsigned int max_segment) 67 { 68 unsigned int page_count; /* restricted by sg_alloc_table */ 69 unsigned long i; 70 struct scatterlist *sg; 71 unsigned long next_pfn = 0; /* suppress gcc warning */ 72 gfp_t noreclaim; 73 int ret; 74 75 if (overflows_type(size / PAGE_SIZE, page_count)) 76 return -E2BIG; 77 78 page_count = size / PAGE_SIZE; 79 /* 80 * If there's no chance of allocating enough pages for the whole 81 * object, bail early. 82 */ 83 if (size > resource_size(&mr->region)) 84 return -ENOMEM; 85 86 if (sg_alloc_table(st, page_count, GFP_KERNEL | __GFP_NOWARN)) 87 return -ENOMEM; 88 89 /* 90 * Get the list of pages out of our struct file. They'll be pinned 91 * at this point until we release them. 92 * 93 * Fail silently without starting the shrinker 94 */ 95 mapping_set_unevictable(mapping); 96 noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM); 97 noreclaim |= __GFP_NORETRY | __GFP_NOWARN; 98 99 sg = st->sgl; 100 st->nents = 0; 101 for (i = 0; i < page_count; i++) { 102 struct folio *folio; 103 unsigned long nr_pages; 104 const unsigned int shrink[] = { 105 I915_SHRINK_BOUND | I915_SHRINK_UNBOUND, 106 0, 107 }, *s = shrink; 108 gfp_t gfp = noreclaim; 109 110 do { 111 cond_resched(); 112 folio = shmem_read_folio_gfp(mapping, i, gfp); 113 if (!IS_ERR(folio)) 114 break; 115 116 if (!*s) { 117 ret = PTR_ERR(folio); 118 goto err_sg; 119 } 120 121 i915_gem_shrink(NULL, i915, 2 * page_count, NULL, *s++); 122 123 /* 124 * We've tried hard to allocate the memory by reaping 125 * our own buffer, now let the real VM do its job and 126 * go down in flames if truly OOM. 127 * 128 * However, since graphics tend to be disposable, 129 * defer the oom here by reporting the ENOMEM back 130 * to userspace. 131 */ 132 if (!*s) { 133 /* reclaim and warn, but no oom */ 134 gfp = mapping_gfp_mask(mapping); 135 136 /* 137 * Our bo are always dirty and so we require 138 * kswapd to reclaim our pages (direct reclaim 139 * does not effectively begin pageout of our 140 * buffers on its own). However, direct reclaim 141 * only waits for kswapd when under allocation 142 * congestion. So as a result __GFP_RECLAIM is 143 * unreliable and fails to actually reclaim our 144 * dirty pages -- unless you try over and over 145 * again with !__GFP_NORETRY. However, we still 146 * want to fail this allocation rather than 147 * trigger the out-of-memory killer and for 148 * this we want __GFP_RETRY_MAYFAIL. 149 */ 150 gfp |= __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 151 } 152 } while (1); 153 154 nr_pages = min_t(unsigned long, 155 folio_nr_pages(folio), page_count - i); 156 if (!i || 157 sg->length >= max_segment || 158 folio_pfn(folio) != next_pfn) { 159 if (i) 160 sg = sg_next(sg); 161 162 st->nents++; 163 sg_set_folio(sg, folio, nr_pages * PAGE_SIZE, 0); 164 } else { 165 /* XXX: could overflow? */ 166 sg->length += nr_pages * PAGE_SIZE; 167 } 168 next_pfn = folio_pfn(folio) + nr_pages; 169 i += nr_pages - 1; 170 171 /* Check that the i965g/gm workaround works. */ 172 GEM_BUG_ON(gfp & __GFP_DMA32 && next_pfn >= 0x00100000UL); 173 } 174 if (sg) /* loop terminated early; short sg table */ 175 sg_mark_end(sg); 176 177 /* Trim unused sg entries to avoid wasting memory. */ 178 i915_sg_trim(st); 179 180 return 0; 181 err_sg: 182 sg_mark_end(sg); 183 if (sg != st->sgl) { 184 shmem_sg_free_table(st, mapping, false, false); 185 } else { 186 mapping_clear_unevictable(mapping); 187 sg_free_table(st); 188 } 189 190 /* 191 * shmemfs first checks if there is enough memory to allocate the page 192 * and reports ENOSPC should there be insufficient, along with the usual 193 * ENOMEM for a genuine allocation failure. 194 * 195 * We use ENOSPC in our driver to mean that we have run out of aperture 196 * space and so want to translate the error from shmemfs back to our 197 * usual understanding of ENOMEM. 198 */ 199 if (ret == -ENOSPC) 200 ret = -ENOMEM; 201 202 return ret; 203 } 204 205 static int shmem_get_pages(struct drm_i915_gem_object *obj) 206 { 207 struct drm_i915_private *i915 = to_i915(obj->base.dev); 208 struct intel_memory_region *mem = obj->mm.region; 209 struct address_space *mapping = obj->base.filp->f_mapping; 210 unsigned int max_segment = i915_sg_segment_size(i915->drm.dev); 211 struct sg_table *st; 212 int ret; 213 214 /* 215 * Assert that the object is not currently in any GPU domain. As it 216 * wasn't in the GTT, there shouldn't be any way it could have been in 217 * a GPU cache 218 */ 219 GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS); 220 GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS); 221 222 rebuild_st: 223 st = kmalloc(sizeof(*st), GFP_KERNEL | __GFP_NOWARN); 224 if (!st) 225 return -ENOMEM; 226 227 ret = shmem_sg_alloc_table(i915, st, obj->base.size, mem, mapping, 228 max_segment); 229 if (ret) 230 goto err_st; 231 232 ret = i915_gem_gtt_prepare_pages(obj, st); 233 if (ret) { 234 /* 235 * DMA remapping failed? One possible cause is that 236 * it could not reserve enough large entries, asking 237 * for PAGE_SIZE chunks instead may be helpful. 238 */ 239 if (max_segment > PAGE_SIZE) { 240 shmem_sg_free_table(st, mapping, false, false); 241 kfree(st); 242 243 max_segment = PAGE_SIZE; 244 goto rebuild_st; 245 } else { 246 dev_warn(i915->drm.dev, 247 "Failed to DMA remap %zu pages\n", 248 obj->base.size >> PAGE_SHIFT); 249 goto err_pages; 250 } 251 } 252 253 if (i915_gem_object_needs_bit17_swizzle(obj)) 254 i915_gem_object_do_bit_17_swizzle(obj, st); 255 256 if (i915_gem_object_can_bypass_llc(obj)) 257 obj->cache_dirty = true; 258 259 __i915_gem_object_set_pages(obj, st); 260 261 return 0; 262 263 err_pages: 264 shmem_sg_free_table(st, mapping, false, false); 265 /* 266 * shmemfs first checks if there is enough memory to allocate the page 267 * and reports ENOSPC should there be insufficient, along with the usual 268 * ENOMEM for a genuine allocation failure. 269 * 270 * We use ENOSPC in our driver to mean that we have run out of aperture 271 * space and so want to translate the error from shmemfs back to our 272 * usual understanding of ENOMEM. 273 */ 274 err_st: 275 if (ret == -ENOSPC) 276 ret = -ENOMEM; 277 278 kfree(st); 279 280 return ret; 281 } 282 283 static int 284 shmem_truncate(struct drm_i915_gem_object *obj) 285 { 286 /* 287 * Our goal here is to return as much of the memory as 288 * is possible back to the system as we are called from OOM. 289 * To do this we must instruct the shmfs to drop all of its 290 * backing pages, *now*. 291 */ 292 shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1); 293 obj->mm.madv = __I915_MADV_PURGED; 294 obj->mm.pages = ERR_PTR(-EFAULT); 295 296 return 0; 297 } 298 299 void __shmem_writeback(size_t size, struct address_space *mapping) 300 { 301 struct writeback_control wbc = { 302 .sync_mode = WB_SYNC_NONE, 303 .nr_to_write = SWAP_CLUSTER_MAX, 304 .range_start = 0, 305 .range_end = LLONG_MAX, 306 .for_reclaim = 1, 307 }; 308 struct folio *folio = NULL; 309 int error = 0; 310 311 /* 312 * Leave mmapings intact (GTT will have been revoked on unbinding, 313 * leaving only CPU mmapings around) and add those folios to the LRU 314 * instead of invoking writeback so they are aged and paged out 315 * as normal. 316 */ 317 while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { 318 if (folio_mapped(folio)) 319 folio_redirty_for_writepage(&wbc, folio); 320 else 321 error = shmem_writeout(folio, &wbc); 322 } 323 } 324 325 static void 326 shmem_writeback(struct drm_i915_gem_object *obj) 327 { 328 __shmem_writeback(obj->base.size, obj->base.filp->f_mapping); 329 } 330 331 static int shmem_shrink(struct drm_i915_gem_object *obj, unsigned int flags) 332 { 333 switch (obj->mm.madv) { 334 case I915_MADV_DONTNEED: 335 return i915_gem_object_truncate(obj); 336 case __I915_MADV_PURGED: 337 return 0; 338 } 339 340 if (flags & I915_GEM_OBJECT_SHRINK_WRITEBACK) 341 shmem_writeback(obj); 342 343 return 0; 344 } 345 346 void 347 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, 348 struct sg_table *pages, 349 bool needs_clflush) 350 { 351 struct drm_i915_private *i915 = to_i915(obj->base.dev); 352 353 GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED); 354 355 if (obj->mm.madv == I915_MADV_DONTNEED) 356 obj->mm.dirty = false; 357 358 if (needs_clflush && 359 (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 && 360 !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)) 361 drm_clflush_sg(pages); 362 363 __start_cpu_write(obj); 364 /* 365 * On non-LLC igfx platforms, force the flush-on-acquire if this is ever 366 * swapped-in. Our async flush path is not trust worthy enough yet(and 367 * happens in the wrong order), and with some tricks it's conceivable 368 * for userspace to change the cache-level to I915_CACHE_NONE after the 369 * pages are swapped-in, and since execbuf binds the object before doing 370 * the async flush, we have a race window. 371 */ 372 if (!HAS_LLC(i915) && !IS_DGFX(i915)) 373 obj->cache_dirty = true; 374 } 375 376 void i915_gem_object_put_pages_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages) 377 { 378 __i915_gem_object_release_shmem(obj, pages, true); 379 380 i915_gem_gtt_finish_pages(obj, pages); 381 382 if (i915_gem_object_needs_bit17_swizzle(obj)) 383 i915_gem_object_save_bit_17_swizzle(obj, pages); 384 385 shmem_sg_free_table(pages, file_inode(obj->base.filp)->i_mapping, 386 obj->mm.dirty, obj->mm.madv == I915_MADV_WILLNEED); 387 kfree(pages); 388 obj->mm.dirty = false; 389 } 390 391 static void 392 shmem_put_pages(struct drm_i915_gem_object *obj, struct sg_table *pages) 393 { 394 if (likely(i915_gem_object_has_struct_page(obj))) 395 i915_gem_object_put_pages_shmem(obj, pages); 396 else 397 i915_gem_object_put_pages_phys(obj, pages); 398 } 399 400 static int 401 shmem_pwrite(struct drm_i915_gem_object *obj, 402 const struct drm_i915_gem_pwrite *arg) 403 { 404 char __user *user_data = u64_to_user_ptr(arg->data_ptr); 405 struct file *file = obj->base.filp; 406 struct kiocb kiocb; 407 struct iov_iter iter; 408 ssize_t written; 409 u64 size = arg->size; 410 411 /* Caller already validated user args */ 412 GEM_BUG_ON(!access_ok(user_data, arg->size)); 413 414 if (!i915_gem_object_has_struct_page(obj)) 415 return i915_gem_object_pwrite_phys(obj, arg); 416 417 /* 418 * Before we instantiate/pin the backing store for our use, we 419 * can prepopulate the shmemfs filp efficiently using a write into 420 * the pagecache. We avoid the penalty of instantiating all the 421 * pages, important if the user is just writing to a few and never 422 * uses the object on the GPU, and using a direct write into shmemfs 423 * allows it to avoid the cost of retrieving a page (either swapin 424 * or clearing-before-use) before it is overwritten. 425 */ 426 if (i915_gem_object_has_pages(obj)) 427 return -ENODEV; 428 429 if (obj->mm.madv != I915_MADV_WILLNEED) 430 return -EFAULT; 431 432 if (size > MAX_RW_COUNT) 433 return -EFBIG; 434 435 if (!file->f_op->write_iter) 436 return -EINVAL; 437 438 init_sync_kiocb(&kiocb, file); 439 kiocb.ki_pos = arg->offset; 440 iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)user_data, size); 441 442 written = file->f_op->write_iter(&kiocb, &iter); 443 BUG_ON(written == -EIOCBQUEUED); 444 445 if (written != size) 446 return -EIO; 447 448 if (written < 0) 449 return written; 450 451 return 0; 452 } 453 454 static int 455 shmem_pread(struct drm_i915_gem_object *obj, 456 const struct drm_i915_gem_pread *arg) 457 { 458 if (!i915_gem_object_has_struct_page(obj)) 459 return i915_gem_object_pread_phys(obj, arg); 460 461 return -ENODEV; 462 } 463 464 static void shmem_release(struct drm_i915_gem_object *obj) 465 { 466 if (i915_gem_object_has_struct_page(obj)) 467 i915_gem_object_release_memory_region(obj); 468 469 fput(obj->base.filp); 470 } 471 472 const struct drm_i915_gem_object_ops i915_gem_shmem_ops = { 473 .name = "i915_gem_object_shmem", 474 .flags = I915_GEM_OBJECT_IS_SHRINKABLE, 475 476 .get_pages = shmem_get_pages, 477 .put_pages = shmem_put_pages, 478 .truncate = shmem_truncate, 479 .shrink = shmem_shrink, 480 481 .pwrite = shmem_pwrite, 482 .pread = shmem_pread, 483 484 .release = shmem_release, 485 }; 486 487 static int __create_shmem(struct drm_i915_private *i915, 488 struct drm_gem_object *obj, 489 resource_size_t size) 490 { 491 unsigned long flags = VM_NORESERVE; 492 struct file *filp; 493 494 drm_gem_private_object_init(&i915->drm, obj, size); 495 496 /* XXX: The __shmem_file_setup() function returns -EINVAL if size is 497 * greater than MAX_LFS_FILESIZE. 498 * To handle the same error as other code that returns -E2BIG when 499 * the size is too large, we add a code that returns -E2BIG when the 500 * size is larger than the size that can be handled. 501 * If BITS_PER_LONG is 32, size > MAX_LFS_FILESIZE is always false, 502 * so we only needs to check when BITS_PER_LONG is 64. 503 * If BITS_PER_LONG is 32, E2BIG checks are processed when 504 * i915_gem_object_size_2big() is called before init_object() callback 505 * is called. 506 */ 507 if (BITS_PER_LONG == 64 && size > MAX_LFS_FILESIZE) 508 return -E2BIG; 509 510 if (i915->mm.gemfs) 511 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size, 512 flags); 513 else 514 filp = shmem_file_setup("i915", size, flags); 515 if (IS_ERR(filp)) 516 return PTR_ERR(filp); 517 518 obj->filp = filp; 519 return 0; 520 } 521 522 static int shmem_object_init(struct intel_memory_region *mem, 523 struct drm_i915_gem_object *obj, 524 resource_size_t offset, 525 resource_size_t size, 526 resource_size_t page_size, 527 unsigned int flags) 528 { 529 static struct lock_class_key lock_class; 530 struct drm_i915_private *i915 = mem->i915; 531 struct address_space *mapping; 532 unsigned int cache_level; 533 gfp_t mask; 534 int ret; 535 536 ret = __create_shmem(i915, &obj->base, size); 537 if (ret) 538 return ret; 539 540 mask = GFP_HIGHUSER | __GFP_RECLAIMABLE; 541 if (IS_I965GM(i915) || IS_I965G(i915)) { 542 /* 965gm cannot relocate objects above 4GiB. */ 543 mask &= ~__GFP_HIGHMEM; 544 mask |= __GFP_DMA32; 545 } 546 547 mapping = obj->base.filp->f_mapping; 548 mapping_set_gfp_mask(mapping, mask); 549 GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM)); 550 551 i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, flags); 552 obj->mem_flags |= I915_BO_FLAG_STRUCT_PAGE; 553 obj->write_domain = I915_GEM_DOMAIN_CPU; 554 obj->read_domains = I915_GEM_DOMAIN_CPU; 555 556 /* 557 * MTL doesn't snoop CPU cache by default for GPU access (namely 558 * 1-way coherency). However some UMD's are currently depending on 559 * that. Make 1-way coherent the default setting for MTL. A follow 560 * up patch will extend the GEM_CREATE uAPI to allow UMD's specify 561 * caching mode at BO creation time 562 */ 563 if (HAS_LLC(i915) || (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))) 564 /* On some devices, we can have the GPU use the LLC (the CPU 565 * cache) for about a 10% performance improvement 566 * compared to uncached. Graphics requests other than 567 * display scanout are coherent with the CPU in 568 * accessing this cache. This means in this mode we 569 * don't need to clflush on the CPU side, and on the 570 * GPU side we only need to flush internal caches to 571 * get data visible to the CPU. 572 * 573 * However, we maintain the display planes as UC, and so 574 * need to rebind when first used as such. 575 */ 576 cache_level = I915_CACHE_LLC; 577 else 578 cache_level = I915_CACHE_NONE; 579 580 i915_gem_object_set_cache_coherency(obj, cache_level); 581 582 i915_gem_object_init_memory_region(obj, mem); 583 584 return 0; 585 } 586 587 struct drm_i915_gem_object * 588 i915_gem_object_create_shmem(struct drm_i915_private *i915, 589 resource_size_t size) 590 { 591 return i915_gem_object_create_region(i915->mm.regions[INTEL_REGION_SMEM], 592 size, 0, 0); 593 } 594 595 /* Allocate a new GEM object and fill it with the supplied data */ 596 struct drm_i915_gem_object * 597 i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, 598 const void *data, resource_size_t size) 599 { 600 struct drm_i915_gem_object *obj; 601 struct file *file; 602 loff_t pos = 0; 603 ssize_t err; 604 605 GEM_WARN_ON(IS_DGFX(i915)); 606 obj = i915_gem_object_create_shmem(i915, round_up(size, PAGE_SIZE)); 607 if (IS_ERR(obj)) 608 return obj; 609 610 GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU); 611 612 file = obj->base.filp; 613 err = kernel_write(file, data, size, &pos); 614 615 if (err < 0) 616 goto fail; 617 618 if (err != size) { 619 err = -EIO; 620 goto fail; 621 } 622 623 return obj; 624 625 fail: 626 i915_gem_object_put(obj); 627 return ERR_PTR(err); 628 } 629 630 static int init_shmem(struct intel_memory_region *mem) 631 { 632 i915_gemfs_init(mem->i915); 633 intel_memory_region_set_name(mem, "system"); 634 635 return 0; /* We have fallback to the kernel mnt if gemfs init failed. */ 636 } 637 638 static int release_shmem(struct intel_memory_region *mem) 639 { 640 i915_gemfs_fini(mem->i915); 641 return 0; 642 } 643 644 static const struct intel_memory_region_ops shmem_region_ops = { 645 .init = init_shmem, 646 .release = release_shmem, 647 .init_object = shmem_object_init, 648 }; 649 650 struct intel_memory_region *i915_gem_shmem_setup(struct drm_i915_private *i915, 651 u16 type, u16 instance) 652 { 653 return intel_memory_region_create(i915, 0, 654 totalram_pages() << PAGE_SHIFT, 655 PAGE_SIZE, 0, 0, 656 type, instance, 657 &shmem_region_ops); 658 } 659 660 bool i915_gem_object_is_shmem(const struct drm_i915_gem_object *obj) 661 { 662 return obj->ops == &i915_gem_shmem_ops; 663 } 664