// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"

/*
 * Regular page slots are stabilized by the page lock even without the tree
 * itself locked.  These unlocked entries need verification under the tree
 * lock.
 */
static inline void __clear_shadow_entry(struct address_space *mapping,
				pgoff_t index, void *entry)
{
	XA_STATE(xas, &mapping->i_pages, index);

	xas_set_update(&xas, workingset_update_node);
	if (xas_load(&xas) != entry)
		return;
	xas_store(&xas, NULL);
}

static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
			       void *entry)
{
	xa_lock_irq(&mapping->i_pages);
	__clear_shadow_entry(mapping, index, entry);
	xa_unlock_irq(&mapping->i_pages);
}

/*
 * Unconditionally remove exceptional entries. Usually called from truncate
 * path. Note that the pagevec may be altered by this function by removing
 * exceptional entries similar to what pagevec_remove_exceptionals does.
 */
static void truncate_exceptional_pvec_entries(struct address_space *mapping,
				struct pagevec *pvec, pgoff_t *indices)
{
	int i, j;
	bool dax;

	/* Handled by shmem itself */
	if (shmem_mapping(mapping))
		return;

	for (j = 0; j < pagevec_count(pvec); j++)
		if (xa_is_value(pvec->pages[j]))
			break;

	if (j == pagevec_count(pvec))
		return;

	dax = dax_mapping(mapping);
	if (!dax)
		xa_lock_irq(&mapping->i_pages);

	for (i = j; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		pgoff_t index = indices[i];

		if (!xa_is_value(page)) {
			pvec->pages[j++] = page;
			continue;
		}

		if (unlikely(dax)) {
			dax_delete_mapping_entry(mapping, index);
			continue;
		}

		__clear_shadow_entry(mapping, index, page);
	}

	if (!dax)
		xa_unlock_irq(&mapping->i_pages);
	pvec->nr = j;
}
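
/*
 * A worked illustration of the compaction above (an example only, no extra
 * code): with a non-DAX mapping and pvec->pages = { page A, shadow, page B,
 * shadow }, the shadow entries are removed from the xarray, the surviving
 * pages are copied down so that pvec->pages becomes { page A, page B }, and
 * pvec->nr ends up as 2.
 */
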
/*
 * Invalidate exceptional entry if easily possible. This handles exceptional
 * entries for invalidate_inode_pages().
 */
static int invalidate_exceptional_entry(struct address_space *mapping,
					pgoff_t index, void *entry)
{
	/* Handled by shmem itself, or for DAX we do nothing. */
	if (shmem_mapping(mapping) || dax_mapping(mapping))
		return 1;
	clear_shadow_entry(mapping, index, entry);
	return 1;
}

/*
 * Invalidate exceptional entry if clean. This handles exceptional entries for
 * invalidate_inode_pages2() so for DAX it evicts only clean entries.
 */
static int invalidate_exceptional_entry2(struct address_space *mapping,
					 pgoff_t index, void *entry)
{
	/* Handled by shmem itself */
	if (shmem_mapping(mapping))
		return 1;
	if (dax_mapping(mapping))
		return dax_invalidate_mapping_entry_sync(mapping, index);
	clear_shadow_entry(mapping, index, entry);
	return 1;
}

/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned int offset,
		       unsigned int length)
{
	void (*invalidatepage)(struct page *, unsigned int, unsigned int);

	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset, length);
}
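
/*
 * A minimal sketch of an ->invalidatepage implementation that the hook above
 * dispatches to.  foo_invalidatepage and its private bookkeeping are
 * hypothetical; a plain buffer-backed filesystem could equally point
 * ->invalidatepage straight at block_invalidatepage:
 *
 *	static void foo_invalidatepage(struct page *page, unsigned int offset,
 *				       unsigned int length)
 *	{
 *		// drop fs-private (e.g. journalling) state for the range
 *		block_invalidatepage(page, offset, length);
 *	}
 */
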
/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		unsigned int nr = thp_nr_pages(page);
		unmap_mapping_pages(mapping, page->index, nr, false);
	}

	if (page_has_private(page))
		do_invalidatepage(page, 0, thp_size(page));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 * Hence dirty accounting check is placed after invalidation.
	 */
	cancel_dirty_page(page);
	ClearPageMappedToDisk(page);
}

/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);

	if (page->mapping != mapping)
		return -EIO;

	truncate_cleanup_page(mapping, page);
	delete_from_page_cache(page);
	return 0;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * the specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidatepage() accepts a range to invalidate,
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * properly page aligned.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	pgoff_t		start;		/* inclusive */
	pgoff_t		end;		/* exclusive */
	unsigned int	partial_start;	/* inclusive */
	unsigned int	partial_end;	/* exclusive */
	struct pagevec	pvec;
	pgoff_t		indices[PAGEVEC_SIZE];
	pgoff_t		index;
	int		i;

	if (mapping_empty(mapping))
		goto out;

	/* Offsets within partial pages */
	partial_start = lstart & (PAGE_SIZE - 1);
	partial_end = (lend + 1) & (PAGE_SIZE - 1);

	/*
	 * 'start' and 'end' always cover the range of pages to be fully
	 * truncated.  Partial pages are covered with 'partial_start' at the
	 * start of the range and 'partial_end' at the end of the range.
	 * Note that 'end' is exclusive while 'lend' is inclusive.
	 */
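
	/*
	 * Worked example (illustration only, assuming PAGE_SIZE == 4096):
	 * for lstart == 5000 and lend == 13000, partial_start == 904 and
	 * partial_end == 713, while start == 2 and end == 3 below.  Page 2
	 * is removed outright, page 1 is zeroed from byte 904 onwards and
	 * page 3 is zeroed up to (but not including) byte 713.
	 */
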
	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (lend == -1)
		/*
		 * lend == -1 indicates end-of-file so we have to set 'end'
		 * to the highest possible pgoff_t and since the type is
		 * unsigned we're using -1.
		 */
		end = -1;
	else
		end = (lend + 1) >> PAGE_SHIFT;

	pagevec_init(&pvec);
	index = start;
	while (index < end && find_lock_entries(mapping, index, end - 1,
			&pvec, indices)) {
		index = indices[pagevec_count(&pvec) - 1] + 1;
		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
		for (i = 0; i < pagevec_count(&pvec); i++)
			truncate_cleanup_page(mapping, pvec.pages[i]);
		delete_from_page_cache_batch(mapping, &pvec);
		for (i = 0; i < pagevec_count(&pvec); i++)
			unlock_page(pvec.pages[i]);
		pagevec_release(&pvec);
		cond_resched();
	}

	if (partial_start) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			unsigned int top = PAGE_SIZE;
			if (start > end) {
				/* Truncation within a single page */
				top = partial_end;
				partial_end = 0;
			}
			wait_on_page_writeback(page);
			zero_user_segment(page, partial_start, top);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, partial_start,
						  top - partial_start);
			unlock_page(page);
			put_page(page);
		}
	}
	if (partial_end) {
		struct page *page = find_lock_page(mapping, end);
		if (page) {
			wait_on_page_writeback(page);
			zero_user_segment(page, 0, partial_end);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, 0,
						  partial_end);
			unlock_page(page);
			put_page(page);
		}
	}
	/*
	 * If the truncation happened within a single page no pages
	 * will be released, just zeroed, so we can bail out now.
	 */
	if (start >= end)
		goto out;

	index = start;
	for ( ; ; ) {
		cond_resched();
		if (!find_get_entries(mapping, index, end - 1, &pvec,
				indices)) {
			/* If all gone from start onwards, we're done */
			if (index == start)
				break;
			/* Otherwise restart to make sure all gone */
			index = start;
			continue;
		}

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];

			if (xa_is_value(page))
				continue;

			lock_page(page);
			WARN_ON(page_to_index(page) != index);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
		pagevec_release(&pvec);
		index++;
	}

out:
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
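
/*
 * A minimal usage sketch of the lend == -1 convention handled above ("inode"
 * stands for whichever inode is being shrunk): removing everything from byte
 * offset newsize through end-of-file is simply
 *
 *	truncate_inode_pages_range(inode->i_mapping, newsize, (loff_t)-1);
 *
 * which is exactly what the truncate_inode_pages() wrapper below does.
 */
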
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_mutex.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
	/*
	 * Page reclaim cannot participate in regular inode lifetime
	 * management (can't call iput()) and thus can race with the
	 * inode teardown.  Tell it when the address space is exiting,
	 * so that it does not install eviction information after the
	 * final truncate has begun.
	 */
	mapping_set_exiting(mapping);

	if (!mapping_empty(mapping)) {
		/*
		 * As truncation uses a lockless tree lookup, cycle
		 * the tree lock to make sure any ongoing tree
		 * modification that does not see AS_EXITING is
		 * completed before starting the final truncate.
		 */
		xa_lock_irq(&mapping->i_pages);
		xa_unlock_irq(&mapping->i_pages);
	}

	/*
	 * Cleancache needs notification even if there are no pages or shadow
	 * entries.
	 */
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
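
/*
 * A minimal sketch of the .evict_inode usage described above.  foo_evict_inode
 * is a hypothetical filesystem method; one with no other teardown work would
 * do roughly:
 *
 *	static void foo_evict_inode(struct inode *inode)
 *	{
 *		truncate_inode_pages_final(&inode->i_data);
 *		clear_inode(inode);
 *	}
 */
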
static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	pagevec_init(&pvec);
	while (find_lock_entries(mapping, index, end, &pvec, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];

			if (xa_is_value(page)) {
				invalidate_exceptional_entry(mapping, index,
							     page);
				continue;
			}
			index += thp_nr_pages(page) - 1;

			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest and we try to speed up its reclaim.
			 */
			if (!ret) {
				deactivate_file_page(page);
				/* It is likely on the pagevec of a remote CPU */
				if (nr_pagevec)
					(*nr_pagevec)++;
			}
			count += ret;
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	return count;
}

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 *
 * Return: the number of pages that were invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	return __invalidate_mapping_pages(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
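
/*
 * A minimal usage sketch: dropping every clean, unmapped page cached for an
 * inode (roughly what a "don't need this cache" hint boils down to) is
 *
 *	invalidate_mapping_pages(inode->i_mapping, 0, -1);
 *
 * Dirty, locked, mapped and writeback pages are skipped, so this is a
 * best-effort hint, not a guarantee that the cache ends up empty.
 */
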
/**
 * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 * @nr_pagevec: out parameter counting pages that could not be invalidated
 *
 * This helper is similar to invalidate_mapping_pages(), except that it accounts
 * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
 * will be used by the caller.
 */
void invalidate_mapping_pagevec(struct address_space *mapping,
		pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
	__invalidate_mapping_pages(mapping, start, end, nr_pagevec);
}

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	unsigned long flags;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	xa_lock_irqsave(&mapping->i_pages, flags);
	if (PageDirty(page))
		goto failed;

	BUG_ON(page_has_private(page));
	__delete_from_page_cache(page, NULL);
	xa_unlock_irqrestore(&mapping->i_pages, flags);

	if (mapping->a_ops->freepage)
		mapping->a_ops->freepage(page);

	put_page(page);	/* pagecache ref */
	return 1;
failed:
	xa_unlock_irqrestore(&mapping->i_pages, flags);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;

	if (mapping_empty(mapping))
		goto out;

	pagevec_init(&pvec);
	index = start;
	while (find_get_entries(mapping, index, end, &pvec, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];

			if (xa_is_value(page)) {
				if (!invalidate_exceptional_entry2(mapping,
								   index, page))
					ret = -EBUSY;
				continue;
			}

			lock_page(page);
			WARN_ON(page_to_index(page) != index);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_pages(mapping, index,
						(1 + end - index), false);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_pages(mapping, index,
								1, false);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	/*
	 * For DAX we invalidate page tables after invalidating page cache.  We
	 * could invalidate page tables while invalidating each entry however
	 * that would be expensive.  And doing range unmapping before doesn't
	 * work as we have no cheap way to find whether a page cache entry got
	 * remapped later.
	 */
	if (dax_mapping(mapping)) {
		unmap_mapping_pages(mapping, start, end - start + 1, false);
	}
out:
	cleancache_invalidate_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
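
/*
 * A minimal sketch of the typical use after a direct I/O write, so that stale
 * cached pages covering the written byte range [pos, pos + count) do not
 * survive (pos and count are placeholders for the caller's file position and
 * length):
 *
 *	err = invalidate_inode_pages2_range(inode->i_mapping,
 *				pos >> PAGE_SHIFT,
 *				(pos + count - 1) >> PAGE_SHIFT);
 *
 * A negative return (-EBUSY, or an error from ->launder_page()) means some
 * page could not be invalidated, e.g. because it was redirtied meanwhile.
 */
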
/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks).  This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps.  However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize.  It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_mutex but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = inode->i_size;

	i_size_write(inode, newsize);
	if (newsize > oldsize)
		pagecache_isize_extended(inode, oldsize, newsize);
	truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
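
/*
 * A minimal sketch of the ->setattr() usage described above, with
 * foo_truncate_blocks standing in (hypothetically) for the filesystem's own
 * on-disk block freeing.  The point is the ordering: truncate_setsize() first,
 * block freeing afterwards:
 *
 *	if (attr->ia_valid & ATTR_SIZE) {
 *		truncate_setsize(inode, attr->ia_size);
 *		foo_truncate_blocks(inode, attr->ia_size);
 *	}
 */
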
/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode: inode for which i_size was extended
 * @from: original inode size
 * @to: new inode size
 *
 * Handle extension of inode size either caused by extending truncate or by
 * write starting after current i_size.  We mark the page straddling current
 * i_size RO so that page_mkwrite() is called on the nearest write access to
 * the page.  This way the filesystem can be sure that page_mkwrite() is called
 * on the page before user writes to the page via mmap after the i_size has
 * been changed.
 *
 * The function must be called after i_size is updated so that page fault
 * coming after we unlock the page will already see the new i_size.
 * The function must be called while we still hold i_mutex - this not only
 * makes sure i_size is stable but also that userspace cannot observe new
 * i_size value before we are prepared to store mmap writes at new inode size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
	int bsize = i_blocksize(inode);
	loff_t rounded_from;
	struct page *page;
	pgoff_t index;

	WARN_ON(to > inode->i_size);

	if (from >= to || bsize == PAGE_SIZE)
		return;
	/* Page straddling @from will not have any hole block created? */
	rounded_from = round_up(from, bsize);
	if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
		return;

	index = from >> PAGE_SHIFT;
	page = find_lock_page(inode->i_mapping, index);
	/* Page not cached? Nothing to do */
	if (!page)
		return;
	/*
	 * See clear_page_dirty_for_io() for details on why set_page_dirty()
	 * is needed.
	 */
	if (page_mkclean(page))
		set_page_dirty(page);
	unlock_page(page);
	put_page(page);
}
EXPORT_SYMBOL(pagecache_isize_extended);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks).  This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t unmap_start = round_up(lstart, PAGE_SIZE);
	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
	/*
	 * This rounding is currently just for example: unmap_mapping_range
	 * expands its hole outwards, whereas we want it to contract the hole
	 * inwards.  However, existing callers of truncate_pagecache_range are
	 * doing their own page rounding first.  Note that unmap_mapping_range
	 * allows holelen 0 for all, and we allow lend -1 for end of file.
	 */

	/*
	 * Unlike in truncate_pagecache, unmap_mapping_range is called only
	 * once (before truncating pagecache), and without "even_cows" flag:
	 * hole-punching should not remove private COWed pages from the hole.
	 */
	if ((u64)unmap_end > (u64)unmap_start)
		unmap_mapping_range(mapping, unmap_start,
				    1 + unmap_end - unmap_start, 0);
	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
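
/*
 * A minimal sketch of the hole-punch usage described above, as it might appear
 * in a filesystem's fallocate(FALLOC_FL_PUNCH_HOLE) path (offset and len are
 * the caller's byte range; the on-disk block freeing that follows is omitted):
 *
 *	truncate_pagecache_range(inode, offset, offset + len - 1);
 *
 * Note the inclusive last-byte convention of the second argument, matching
 * @lend above.
 */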