/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include <linux/cleancache.h>
#include "internal.h"


/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);
	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
	cleancache_invalidate_page(page->mapping, page);
	if (page_has_private(page))
		do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself, it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
	if (TestClearPageDirty(page)) {
		struct address_space *mapping = page->mapping;
		if (mapping && mapping_cap_account_dirty(mapping)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			dec_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			if (account_size)
				task_io_account_cancelled_write(account_size);
		}
	}
}
EXPORT_SYMBOL(cancel_dirty_page);
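
/*
 * Illustrative sketch (not part of this file's API): how a simple
 * filesystem might wire up ->invalidatepage so that do_invalidatepage()
 * above finds it.  "examplefs" is hypothetical; a block-based filesystem
 * with no private page state beyond buffer_heads can simply forward to
 * block_invalidatepage(), which is also the fallback do_invalidatepage()
 * uses when the method is NULL.
 */
#ifdef CONFIG_BLOCK
static void examplefs_invalidatepage(struct page *page, unsigned long offset)
{
	/* Drop the buffers beyond @offset; the partial-page case is
	 * handled inside block_invalidatepage(). */
	block_invalidatepage(page, offset);
}

static const struct address_space_operations examplefs_aops __maybe_unused = {
	.invalidatepage	= examplefs_invalidatepage,
};
#endif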

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bale out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return -EIO;

	if (page_has_private(page))
		do_invalidatepage(page, 0);

	cancel_dirty_page(page, PAGE_CACHE_SIZE);

	ClearPageMappedToDisk(page);
	delete_from_page_cache(page);
	return 0;
}

/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		unmap_mapping_range(mapping,
				   (loff_t)page->index << PAGE_CACHE_SHIFT,
				   PAGE_CACHE_SIZE, 0);
	}
	return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}
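
/*
 * Illustrative sketch (not part of this file's API): the calling pattern
 * invalidate_inode_page() expects.  The page must be locked and the
 * caller must hold a reference; the hypothetical helper below uses the
 * same trylock-and-skip approach as invalidate_mapping_pages() later in
 * this file, so busy pages are simply left alone.
 */
static inline int example_try_invalidate_page(struct page *page)
{
	int ret = 0;

	if (!trylock_page(page))	/* never block; skip contended pages */
		return 0;
	ret = invalidate_inode_page(page);
	unlock_page(page);
	return ret;			/* 1 if the page was dropped */
}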

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out the partial page
 * if lstart is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t index;
	pgoff_t end;
	int i;

	cleancache_invalidate_inode(mapping);
	if (mapping->nrpages == 0)
		return;

	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
	end = (lend >> PAGE_CACHE_SHIFT);

	pagevec_init(&pvec, 0);
	index = start;
	while (index <= end && pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	index = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
			if (index == start)
				break;
			index = start;
			continue;
		}
		if (index == start && pvec.pages[0]->index > end) {
			pagevec_release(&pvec);
			break;
		}
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			lock_page(page);
			WARN_ON(page->index != index);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		index++;
	}
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
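
/*
 * Illustrative sketch (not part of this file's API): truncate_inode_pages()
 * expects to be serialised by inode->i_mutex, as noted above.  The
 * hypothetical helper below drops every page of an inode's pagecache
 * under that lock; the "example_" name is not a real kernel interface.
 */
static inline void example_truncate_all_pagecache(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);		/* serialise against truncate */
	truncate_inode_pages(inode->i_mapping, 0);
	mutex_unlock(&inode->i_mutex);
}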

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity.  It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	/*
	 * Note: this function may get called on a shmem/tmpfs mapping:
	 * pagevec_lookup() might then return 0 prematurely (because it
	 * got a gangful of swap entries); but it's hardly worth worrying
	 * about - it can rarely have anything to free from such a mapping
	 * (most pages are dirty), and already skips over any difficulties.
	 */

	pagevec_init(&pvec, 0);
	while (index <= end && pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest, so try to speed up its reclaim.
			 */
			if (!ret)
				deactivate_page(page);
			count += ret;
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}
	return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	spin_lock_irq(&mapping->tree_lock);
	if (PageDirty(page))
		goto failed;

	BUG_ON(page_has_private(page));
	__delete_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);

	if (mapping->a_ops->freepage)
		mapping->a_ops->freepage(page);

	page_cache_release(page);	/* pagecache ref */
	return 1;
failed:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}
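
/*
 * Illustrative sketch (not part of this file's API): invalidate_mapping_pages()
 * above is the "best effort" interface, roughly what an
 * fadvise(POSIX_FADV_DONTNEED)-style caller wants.  The hypothetical helper
 * below only converts byte offsets to page indices; dirty, locked, mapped or
 * writeback pages in the range simply survive the call.
 */
static inline unsigned long example_drop_clean_pages(struct address_space *mapping,
						     loff_t lstart, loff_t lend)
{
	pgoff_t start = lstart >> PAGE_CACHE_SHIFT;	/* first page index */
	pgoff_t end = lend >> PAGE_CACHE_SHIFT;		/* last index, inclusive */

	return invalidate_mapping_pages(mapping, start, end);
}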

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;

	cleancache_invalidate_inode(mapping);
	pagevec_init(&pvec, 0);
	index = start;
	while (index <= end && pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			lock_page(page);
			WARN_ON(page->index != index);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					   (loff_t)index << PAGE_CACHE_SHIFT,
					   (loff_t)(1 + end - index)
							 << PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					   (loff_t)index << PAGE_CACHE_SHIFT,
					   PAGE_CACHE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}
	cleancache_invalidate_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
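
/*
 * Illustrative sketch (not part of this file's API): the *2 variants above
 * give the strong guarantee that e.g. direct I/O or a network filesystem
 * needs when cached pages may be stale.  The hypothetical helper below
 * writes dirty data back first so that invalidation has little left to
 * launder, then insists that every page in the range be dropped; -EBUSY
 * means some page is still in use.
 */
static inline int example_revalidate_range(struct address_space *mapping,
					   loff_t lstart, loff_t lend)
{
	int err;

	err = filemap_write_and_wait_range(mapping, lstart, lend);
	if (err)
		return err;
	return invalidate_inode_pages2_range(mapping,
					     lstart >> PAGE_CACHE_SHIFT,
					     lend >> PAGE_CACHE_SHIFT);
}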

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @oldsize: old file size
 * @newsize: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps.  However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize.  It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with inode->i_mutex held and before all filesystem specific
 * block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);

	truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t unmap_start = round_up(lstart, PAGE_SIZE);
	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
	/*
	 * This rounding is currently just for example: unmap_mapping_range
	 * expands its hole outwards, whereas we want it to contract the hole
	 * inwards.  However, existing callers of truncate_pagecache_range are
	 * doing their own page rounding first; and truncate_inode_pages_range
	 * currently BUGs if lend is not pagealigned-1 (it handles partial
	 * page at start of hole, but not partial page at end of hole).  Note
	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
	 */

	/*
	 * Unlike in truncate_pagecache, unmap_mapping_range is called only
	 * once (before truncating pagecache), and without "even_cows" flag:
	 * hole-punching should not remove private COWed pages from the hole.
	 */
	if ((u64)unmap_end > (u64)unmap_start)
		unmap_mapping_range(mapping, unmap_start,
				    1 + unmap_end - unmap_start, 0);
	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
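
/*
 * Illustrative sketch (not part of this file): a minimal ->setattr for a
 * hypothetical "examplefs" with no on-disk block metadata to trim, showing
 * where truncate_setsize() typically sits in the ATTR_SIZE path (compare
 * simple_setattr() in fs/libfs.c).  Real filesystems truncate their blocks
 * after this point, as the comments above require.
 */
static inline int examplefs_setattr_sketch(struct dentry *dentry,
					   struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE)
		truncate_setsize(inode, attr->ia_size);	/* i_size + pagecache */

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}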