1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2014-2016 Intel Corporation 4 */ 5 6 #include <linux/pagevec.h> 7 #include <linux/shmem_fs.h> 8 #include <linux/swap.h> 9 #include <linux/uio.h> 10 11 #include <drm/drm_cache.h> 12 13 #include "gem/i915_gem_region.h" 14 #include "i915_drv.h" 15 #include "i915_gem_object.h" 16 #include "i915_gem_tiling.h" 17 #include "i915_gemfs.h" 18 #include "i915_scatterlist.h" 19 #include "i915_trace.h" 20 21 /* 22 * Move folios to appropriate lru and release the batch, decrementing the 23 * ref count of those folios. 24 */ 25 static void check_release_folio_batch(struct folio_batch *fbatch) 26 { 27 check_move_unevictable_folios(fbatch); 28 __folio_batch_release(fbatch); 29 cond_resched(); 30 } 31 32 void shmem_sg_free_table(struct sg_table *st, struct address_space *mapping, 33 bool dirty, bool backup) 34 { 35 struct sgt_iter sgt_iter; 36 struct folio_batch fbatch; 37 struct folio *last = NULL; 38 struct page *page; 39 40 mapping_clear_unevictable(mapping); 41 42 folio_batch_init(&fbatch); 43 for_each_sgt_page(page, sgt_iter, st) { 44 struct folio *folio = page_folio(page); 45 46 if (folio == last) 47 continue; 48 last = folio; 49 if (dirty) 50 folio_mark_dirty(folio); 51 if (backup) 52 folio_mark_accessed(folio); 53 54 if (!folio_batch_add(&fbatch, folio)) 55 check_release_folio_batch(&fbatch); 56 } 57 if (fbatch.nr) 58 check_release_folio_batch(&fbatch); 59 60 sg_free_table(st); 61 } 62 63 int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, 64 size_t size, struct intel_memory_region *mr, 65 struct address_space *mapping, 66 unsigned int max_segment) 67 { 68 unsigned int page_count; /* restricted by sg_alloc_table */ 69 unsigned long i; 70 struct scatterlist *sg; 71 unsigned long next_pfn = 0; /* suppress gcc warning */ 72 gfp_t noreclaim; 73 int ret; 74 75 if (overflows_type(size / PAGE_SIZE, page_count)) 76 return -E2BIG; 77 78 page_count = size / PAGE_SIZE; 79 /* 80 * If there's no chance of allocating enough pages for the whole 81 * object, bail early. 82 */ 83 if (size > resource_size(&mr->region)) 84 return -ENOMEM; 85 86 if (sg_alloc_table(st, page_count, GFP_KERNEL | __GFP_NOWARN)) 87 return -ENOMEM; 88 89 /* 90 * Get the list of pages out of our struct file. They'll be pinned 91 * at this point until we release them. 92 * 93 * Fail silently without starting the shrinker 94 */ 95 mapping_set_unevictable(mapping); 96 noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM); 97 noreclaim |= __GFP_NORETRY | __GFP_NOWARN; 98 99 sg = st->sgl; 100 st->nents = 0; 101 for (i = 0; i < page_count; i++) { 102 struct folio *folio; 103 unsigned long nr_pages; 104 const unsigned int shrink[] = { 105 I915_SHRINK_BOUND | I915_SHRINK_UNBOUND, 106 0, 107 }, *s = shrink; 108 gfp_t gfp = noreclaim; 109 110 do { 111 cond_resched(); 112 folio = shmem_read_folio_gfp(mapping, i, gfp); 113 if (!IS_ERR(folio)) 114 break; 115 116 if (!*s) { 117 ret = PTR_ERR(folio); 118 goto err_sg; 119 } 120 121 i915_gem_shrink(NULL, i915, 2 * page_count, NULL, *s++); 122 123 /* 124 * We've tried hard to allocate the memory by reaping 125 * our own buffer, now let the real VM do its job and 126 * go down in flames if truly OOM. 127 * 128 * However, since graphics tend to be disposable, 129 * defer the oom here by reporting the ENOMEM back 130 * to userspace. 131 */ 132 if (!*s) { 133 /* reclaim and warn, but no oom */ 134 gfp = mapping_gfp_mask(mapping); 135 136 /* 137 * Our bo are always dirty and so we require 138 * kswapd to reclaim our pages (direct reclaim 139 * does not effectively begin pageout of our 140 * buffers on its own). However, direct reclaim 141 * only waits for kswapd when under allocation 142 * congestion. So as a result __GFP_RECLAIM is 143 * unreliable and fails to actually reclaim our 144 * dirty pages -- unless you try over and over 145 * again with !__GFP_NORETRY. However, we still 146 * want to fail this allocation rather than 147 * trigger the out-of-memory killer and for 148 * this we want __GFP_RETRY_MAYFAIL. 149 */ 150 gfp |= __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 151 } 152 } while (1); 153 154 nr_pages = min_t(unsigned long, 155 folio_nr_pages(folio), page_count - i); 156 if (!i || 157 sg->length >= max_segment || 158 folio_pfn(folio) != next_pfn) { 159 if (i) 160 sg = sg_next(sg); 161 162 st->nents++; 163 sg_set_folio(sg, folio, nr_pages * PAGE_SIZE, 0); 164 } else { 165 /* XXX: could overflow? */ 166 sg->length += nr_pages * PAGE_SIZE; 167 } 168 next_pfn = folio_pfn(folio) + nr_pages; 169 i += nr_pages - 1; 170 171 /* Check that the i965g/gm workaround works. */ 172 GEM_BUG_ON(gfp & __GFP_DMA32 && next_pfn >= 0x00100000UL); 173 } 174 if (sg) /* loop terminated early; short sg table */ 175 sg_mark_end(sg); 176 177 /* Trim unused sg entries to avoid wasting memory. */ 178 i915_sg_trim(st); 179 180 return 0; 181 err_sg: 182 sg_mark_end(sg); 183 if (sg != st->sgl) { 184 shmem_sg_free_table(st, mapping, false, false); 185 } else { 186 mapping_clear_unevictable(mapping); 187 sg_free_table(st); 188 } 189 190 /* 191 * shmemfs first checks if there is enough memory to allocate the page 192 * and reports ENOSPC should there be insufficient, along with the usual 193 * ENOMEM for a genuine allocation failure. 194 * 195 * We use ENOSPC in our driver to mean that we have run out of aperture 196 * space and so want to translate the error from shmemfs back to our 197 * usual understanding of ENOMEM. 198 */ 199 if (ret == -ENOSPC) 200 ret = -ENOMEM; 201 202 return ret; 203 } 204 205 static int shmem_get_pages(struct drm_i915_gem_object *obj) 206 { 207 struct drm_i915_private *i915 = to_i915(obj->base.dev); 208 struct intel_memory_region *mem = obj->mm.region; 209 struct address_space *mapping = obj->base.filp->f_mapping; 210 unsigned int max_segment = i915_sg_segment_size(i915->drm.dev); 211 struct sg_table *st; 212 int ret; 213 214 /* 215 * Assert that the object is not currently in any GPU domain. As it 216 * wasn't in the GTT, there shouldn't be any way it could have been in 217 * a GPU cache 218 */ 219 GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS); 220 GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS); 221 222 rebuild_st: 223 st = kmalloc(sizeof(*st), GFP_KERNEL | __GFP_NOWARN); 224 if (!st) 225 return -ENOMEM; 226 227 ret = shmem_sg_alloc_table(i915, st, obj->base.size, mem, mapping, 228 max_segment); 229 if (ret) 230 goto err_st; 231 232 ret = i915_gem_gtt_prepare_pages(obj, st); 233 if (ret) { 234 /* 235 * DMA remapping failed? One possible cause is that 236 * it could not reserve enough large entries, asking 237 * for PAGE_SIZE chunks instead may be helpful. 238 */ 239 if (max_segment > PAGE_SIZE) { 240 shmem_sg_free_table(st, mapping, false, false); 241 kfree(st); 242 243 max_segment = PAGE_SIZE; 244 goto rebuild_st; 245 } else { 246 dev_warn(i915->drm.dev, 247 "Failed to DMA remap %zu pages\n", 248 obj->base.size >> PAGE_SHIFT); 249 goto err_pages; 250 } 251 } 252 253 if (i915_gem_object_needs_bit17_swizzle(obj)) 254 i915_gem_object_do_bit_17_swizzle(obj, st); 255 256 if (i915_gem_object_can_bypass_llc(obj)) 257 obj->cache_dirty = true; 258 259 __i915_gem_object_set_pages(obj, st); 260 261 return 0; 262 263 err_pages: 264 shmem_sg_free_table(st, mapping, false, false); 265 /* 266 * shmemfs first checks if there is enough memory to allocate the page 267 * and reports ENOSPC should there be insufficient, along with the usual 268 * ENOMEM for a genuine allocation failure. 269 * 270 * We use ENOSPC in our driver to mean that we have run out of aperture 271 * space and so want to translate the error from shmemfs back to our 272 * usual understanding of ENOMEM. 273 */ 274 err_st: 275 if (ret == -ENOSPC) 276 ret = -ENOMEM; 277 278 kfree(st); 279 280 return ret; 281 } 282 283 static int 284 shmem_truncate(struct drm_i915_gem_object *obj) 285 { 286 /* 287 * Our goal here is to return as much of the memory as 288 * is possible back to the system as we are called from OOM. 289 * To do this we must instruct the shmfs to drop all of its 290 * backing pages, *now*. 291 */ 292 shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1); 293 obj->mm.madv = __I915_MADV_PURGED; 294 obj->mm.pages = ERR_PTR(-EFAULT); 295 296 return 0; 297 } 298 299 void __shmem_writeback(size_t size, struct address_space *mapping) 300 { 301 struct writeback_control wbc = { 302 .sync_mode = WB_SYNC_NONE, 303 .nr_to_write = SWAP_CLUSTER_MAX, 304 .range_start = 0, 305 .range_end = LLONG_MAX, 306 }; 307 struct folio *folio = NULL; 308 int error = 0; 309 310 /* 311 * Leave mmapings intact (GTT will have been revoked on unbinding, 312 * leaving only CPU mmapings around) and add those folios to the LRU 313 * instead of invoking writeback so they are aged and paged out 314 * as normal. 315 */ 316 while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { 317 if (folio_mapped(folio)) 318 folio_redirty_for_writepage(&wbc, folio); 319 else 320 error = shmem_writeout(folio, NULL, NULL); 321 } 322 } 323 324 static void 325 shmem_writeback(struct drm_i915_gem_object *obj) 326 { 327 __shmem_writeback(obj->base.size, obj->base.filp->f_mapping); 328 } 329 330 static int shmem_shrink(struct drm_i915_gem_object *obj, unsigned int flags) 331 { 332 switch (obj->mm.madv) { 333 case I915_MADV_DONTNEED: 334 return i915_gem_object_truncate(obj); 335 case __I915_MADV_PURGED: 336 return 0; 337 } 338 339 if (flags & I915_GEM_OBJECT_SHRINK_WRITEBACK) 340 shmem_writeback(obj); 341 342 return 0; 343 } 344 345 void 346 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, 347 struct sg_table *pages, 348 bool needs_clflush) 349 { 350 struct drm_i915_private *i915 = to_i915(obj->base.dev); 351 352 GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED); 353 354 if (obj->mm.madv == I915_MADV_DONTNEED) 355 obj->mm.dirty = false; 356 357 if (needs_clflush && 358 (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 && 359 !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)) 360 drm_clflush_sg(pages); 361 362 __start_cpu_write(obj); 363 /* 364 * On non-LLC igfx platforms, force the flush-on-acquire if this is ever 365 * swapped-in. Our async flush path is not trust worthy enough yet(and 366 * happens in the wrong order), and with some tricks it's conceivable 367 * for userspace to change the cache-level to I915_CACHE_NONE after the 368 * pages are swapped-in, and since execbuf binds the object before doing 369 * the async flush, we have a race window. 370 */ 371 if (!HAS_LLC(i915) && !IS_DGFX(i915)) 372 obj->cache_dirty = true; 373 } 374 375 void i915_gem_object_put_pages_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages) 376 { 377 __i915_gem_object_release_shmem(obj, pages, true); 378 379 i915_gem_gtt_finish_pages(obj, pages); 380 381 if (i915_gem_object_needs_bit17_swizzle(obj)) 382 i915_gem_object_save_bit_17_swizzle(obj, pages); 383 384 shmem_sg_free_table(pages, file_inode(obj->base.filp)->i_mapping, 385 obj->mm.dirty, obj->mm.madv == I915_MADV_WILLNEED); 386 kfree(pages); 387 obj->mm.dirty = false; 388 } 389 390 static void 391 shmem_put_pages(struct drm_i915_gem_object *obj, struct sg_table *pages) 392 { 393 if (likely(i915_gem_object_has_struct_page(obj))) 394 i915_gem_object_put_pages_shmem(obj, pages); 395 else 396 i915_gem_object_put_pages_phys(obj, pages); 397 } 398 399 static int 400 shmem_pwrite(struct drm_i915_gem_object *obj, 401 const struct drm_i915_gem_pwrite *arg) 402 { 403 char __user *user_data = u64_to_user_ptr(arg->data_ptr); 404 struct file *file = obj->base.filp; 405 struct kiocb kiocb; 406 struct iov_iter iter; 407 ssize_t written; 408 u64 size = arg->size; 409 410 /* Caller already validated user args */ 411 GEM_BUG_ON(!access_ok(user_data, arg->size)); 412 413 if (!i915_gem_object_has_struct_page(obj)) 414 return i915_gem_object_pwrite_phys(obj, arg); 415 416 /* 417 * Before we instantiate/pin the backing store for our use, we 418 * can prepopulate the shmemfs filp efficiently using a write into 419 * the pagecache. We avoid the penalty of instantiating all the 420 * pages, important if the user is just writing to a few and never 421 * uses the object on the GPU, and using a direct write into shmemfs 422 * allows it to avoid the cost of retrieving a page (either swapin 423 * or clearing-before-use) before it is overwritten. 424 */ 425 if (i915_gem_object_has_pages(obj)) 426 return -ENODEV; 427 428 if (obj->mm.madv != I915_MADV_WILLNEED) 429 return -EFAULT; 430 431 if (size > MAX_RW_COUNT) 432 return -EFBIG; 433 434 if (!file->f_op->write_iter) 435 return -EINVAL; 436 437 init_sync_kiocb(&kiocb, file); 438 kiocb.ki_pos = arg->offset; 439 iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)user_data, size); 440 441 written = file->f_op->write_iter(&kiocb, &iter); 442 BUG_ON(written == -EIOCBQUEUED); 443 444 if (written != size) 445 return -EIO; 446 447 if (written < 0) 448 return written; 449 450 return 0; 451 } 452 453 static int 454 shmem_pread(struct drm_i915_gem_object *obj, 455 const struct drm_i915_gem_pread *arg) 456 { 457 if (!i915_gem_object_has_struct_page(obj)) 458 return i915_gem_object_pread_phys(obj, arg); 459 460 return -ENODEV; 461 } 462 463 static void shmem_release(struct drm_i915_gem_object *obj) 464 { 465 if (i915_gem_object_has_struct_page(obj)) 466 i915_gem_object_release_memory_region(obj); 467 468 fput(obj->base.filp); 469 } 470 471 const struct drm_i915_gem_object_ops i915_gem_shmem_ops = { 472 .name = "i915_gem_object_shmem", 473 .flags = I915_GEM_OBJECT_IS_SHRINKABLE, 474 475 .get_pages = shmem_get_pages, 476 .put_pages = shmem_put_pages, 477 .truncate = shmem_truncate, 478 .shrink = shmem_shrink, 479 480 .pwrite = shmem_pwrite, 481 .pread = shmem_pread, 482 483 .release = shmem_release, 484 }; 485 486 static int __create_shmem(struct drm_i915_private *i915, 487 struct drm_gem_object *obj, 488 resource_size_t size) 489 { 490 unsigned long flags = VM_NORESERVE; 491 struct file *filp; 492 493 drm_gem_private_object_init(&i915->drm, obj, size); 494 495 /* XXX: The __shmem_file_setup() function returns -EINVAL if size is 496 * greater than MAX_LFS_FILESIZE. 497 * To handle the same error as other code that returns -E2BIG when 498 * the size is too large, we add a code that returns -E2BIG when the 499 * size is larger than the size that can be handled. 500 * If BITS_PER_LONG is 32, size > MAX_LFS_FILESIZE is always false, 501 * so we only needs to check when BITS_PER_LONG is 64. 502 * If BITS_PER_LONG is 32, E2BIG checks are processed when 503 * i915_gem_object_size_2big() is called before init_object() callback 504 * is called. 505 */ 506 if (BITS_PER_LONG == 64 && size > MAX_LFS_FILESIZE) 507 return -E2BIG; 508 509 if (i915->mm.gemfs) 510 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size, 511 flags); 512 else 513 filp = shmem_file_setup("i915", size, flags); 514 if (IS_ERR(filp)) 515 return PTR_ERR(filp); 516 517 obj->filp = filp; 518 return 0; 519 } 520 521 static int shmem_object_init(struct intel_memory_region *mem, 522 struct drm_i915_gem_object *obj, 523 resource_size_t offset, 524 resource_size_t size, 525 resource_size_t page_size, 526 unsigned int flags) 527 { 528 static struct lock_class_key lock_class; 529 struct drm_i915_private *i915 = mem->i915; 530 struct address_space *mapping; 531 unsigned int cache_level; 532 gfp_t mask; 533 int ret; 534 535 ret = __create_shmem(i915, &obj->base, size); 536 if (ret) 537 return ret; 538 539 mask = GFP_HIGHUSER | __GFP_RECLAIMABLE; 540 if (IS_I965GM(i915) || IS_I965G(i915)) { 541 /* 965gm cannot relocate objects above 4GiB. */ 542 mask &= ~__GFP_HIGHMEM; 543 mask |= __GFP_DMA32; 544 } 545 546 mapping = obj->base.filp->f_mapping; 547 mapping_set_gfp_mask(mapping, mask); 548 GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM)); 549 550 i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, flags); 551 obj->mem_flags |= I915_BO_FLAG_STRUCT_PAGE; 552 obj->write_domain = I915_GEM_DOMAIN_CPU; 553 obj->read_domains = I915_GEM_DOMAIN_CPU; 554 555 /* 556 * MTL doesn't snoop CPU cache by default for GPU access (namely 557 * 1-way coherency). However some UMD's are currently depending on 558 * that. Make 1-way coherent the default setting for MTL. A follow 559 * up patch will extend the GEM_CREATE uAPI to allow UMD's specify 560 * caching mode at BO creation time 561 */ 562 if (HAS_LLC(i915) || (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))) 563 /* On some devices, we can have the GPU use the LLC (the CPU 564 * cache) for about a 10% performance improvement 565 * compared to uncached. Graphics requests other than 566 * display scanout are coherent with the CPU in 567 * accessing this cache. This means in this mode we 568 * don't need to clflush on the CPU side, and on the 569 * GPU side we only need to flush internal caches to 570 * get data visible to the CPU. 571 * 572 * However, we maintain the display planes as UC, and so 573 * need to rebind when first used as such. 574 */ 575 cache_level = I915_CACHE_LLC; 576 else 577 cache_level = I915_CACHE_NONE; 578 579 i915_gem_object_set_cache_coherency(obj, cache_level); 580 581 i915_gem_object_init_memory_region(obj, mem); 582 583 return 0; 584 } 585 586 struct drm_i915_gem_object * 587 i915_gem_object_create_shmem(struct drm_i915_private *i915, 588 resource_size_t size) 589 { 590 return i915_gem_object_create_region(i915->mm.regions[INTEL_REGION_SMEM], 591 size, 0, 0); 592 } 593 594 /* Allocate a new GEM object and fill it with the supplied data */ 595 struct drm_i915_gem_object * 596 i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, 597 const void *data, resource_size_t size) 598 { 599 struct drm_i915_gem_object *obj; 600 struct file *file; 601 loff_t pos = 0; 602 ssize_t err; 603 604 GEM_WARN_ON(IS_DGFX(i915)); 605 obj = i915_gem_object_create_shmem(i915, round_up(size, PAGE_SIZE)); 606 if (IS_ERR(obj)) 607 return obj; 608 609 GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU); 610 611 file = obj->base.filp; 612 err = kernel_write(file, data, size, &pos); 613 614 if (err < 0) 615 goto fail; 616 617 if (err != size) { 618 err = -EIO; 619 goto fail; 620 } 621 622 return obj; 623 624 fail: 625 i915_gem_object_put(obj); 626 return ERR_PTR(err); 627 } 628 629 static int init_shmem(struct intel_memory_region *mem) 630 { 631 i915_gemfs_init(mem->i915); 632 intel_memory_region_set_name(mem, "system"); 633 634 return 0; /* We have fallback to the kernel mnt if gemfs init failed. */ 635 } 636 637 static int release_shmem(struct intel_memory_region *mem) 638 { 639 i915_gemfs_fini(mem->i915); 640 return 0; 641 } 642 643 static const struct intel_memory_region_ops shmem_region_ops = { 644 .init = init_shmem, 645 .release = release_shmem, 646 .init_object = shmem_object_init, 647 }; 648 649 struct intel_memory_region *i915_gem_shmem_setup(struct drm_i915_private *i915, 650 u16 type, u16 instance) 651 { 652 return intel_memory_region_create(i915, 0, 653 totalram_pages() << PAGE_SHIFT, 654 PAGE_SIZE, 0, 0, 655 type, instance, 656 &shmem_region_ops); 657 } 658 659 bool i915_gem_object_is_shmem(const struct drm_i915_gem_object *obj) 660 { 661 return obj->ops == &i915_gem_shmem_ops; 662 } 663