1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * fs/ext4/extents_status.c 4 * 5 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> 6 * Modified by 7 * Allison Henderson <achender@linux.vnet.ibm.com> 8 * Hugh Dickins <hughd@google.com> 9 * Zheng Liu <wenqing.lz@taobao.com> 10 * 11 * Ext4 extents status tree core functions. 12 */ 13 #include <linux/list_sort.h> 14 #include <linux/proc_fs.h> 15 #include <linux/seq_file.h> 16 #include "ext4.h" 17 18 #include <trace/events/ext4.h> 19 #include <kunit/static_stub.h> 20 21 /* 22 * According to previous discussion in Ext4 Developer Workshop, we 23 * will introduce a new structure called io tree to track all extent 24 * status in order to solve some problems that we have met 25 * (e.g. Reservation space warning), and provide extent-level locking. 26 * Delay extent tree is the first step to achieve this goal. It is 27 * original built by Yongqiang Yang. At that time it is called delay 28 * extent tree, whose goal is only track delayed extents in memory to 29 * simplify the implementation of fiemap and bigalloc, and introduce 30 * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called 31 * delay extent tree at the first commit. But for better understand 32 * what it does, it has been rename to extent status tree. 33 * 34 * Step1: 35 * Currently the first step has been done. All delayed extents are 36 * tracked in the tree. It maintains the delayed extent when a delayed 37 * allocation is issued, and the delayed extent is written out or 38 * invalidated. Therefore the implementation of fiemap and bigalloc 39 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. 40 * 41 * The following comment describes the implemenmtation of extent 42 * status tree and future works. 43 * 44 * Step2: 45 * In this step all extent status are tracked by extent status tree. 46 * Thus, we can first try to lookup a block mapping in this tree before 47 * finding it in extent tree. Hence, single extent cache can be removed 48 * because extent status tree can do a better job. Extents in status 49 * tree are loaded on-demand. Therefore, the extent status tree may not 50 * contain all of the extents in a file. Meanwhile we define a shrinker 51 * to reclaim memory from extent status tree because fragmented extent 52 * tree will make status tree cost too much memory. written/unwritten/- 53 * hole extents in the tree will be reclaimed by this shrinker when we 54 * are under high memory pressure. Delayed extents will not be 55 * reclimed because fiemap, bigalloc, and seek_data/hole need it. 56 */ 57 58 /* 59 * Extent status tree implementation for ext4. 60 * 61 * 62 * ========================================================================== 63 * Extent status tree tracks all extent status. 64 * 65 * 1. Why we need to implement extent status tree? 66 * 67 * Without extent status tree, ext4 identifies a delayed extent by looking 68 * up page cache, this has several deficiencies - complicated, buggy, 69 * and inefficient code. 70 * 71 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a 72 * block or a range of blocks are belonged to a delayed extent. 73 * 74 * Let us have a look at how they do without extent status tree. 75 * -- FIEMAP 76 * FIEMAP looks up page cache to identify delayed allocations from holes. 77 * 78 * -- SEEK_HOLE/DATA 79 * SEEK_HOLE/DATA has the same problem as FIEMAP. 80 * 81 * -- bigalloc 82 * bigalloc looks up page cache to figure out if a block is 83 * already under delayed allocation or not to determine whether 84 * quota reserving is needed for the cluster. 85 * 86 * -- writeout 87 * Writeout looks up whole page cache to see if a buffer is 88 * mapped, If there are not very many delayed buffers, then it is 89 * time consuming. 90 * 91 * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, 92 * bigalloc and writeout can figure out if a block or a range of 93 * blocks is under delayed allocation(belonged to a delayed extent) or 94 * not by searching the extent tree. 95 * 96 * 97 * ========================================================================== 98 * 2. Ext4 extent status tree impelmentation 99 * 100 * -- extent 101 * A extent is a range of blocks which are contiguous logically and 102 * physically. Unlike extent in extent tree, this extent in ext4 is 103 * a in-memory struct, there is no corresponding on-disk data. There 104 * is no limit on length of extent, so an extent can contain as many 105 * blocks as they are contiguous logically and physically. 106 * 107 * -- extent status tree 108 * Every inode has an extent status tree and all allocation blocks 109 * are added to the tree with different status. The extent in the 110 * tree are ordered by logical block no. 111 * 112 * -- operations on a extent status tree 113 * There are three important operations on a delayed extent tree: find 114 * next extent, adding a extent(a range of blocks) and removing a extent. 115 * 116 * -- race on a extent status tree 117 * Extent status tree is protected by inode->i_es_lock. 118 * 119 * -- memory consumption 120 * Fragmented extent tree will make extent status tree cost too much 121 * memory. Hence, we will reclaim written/unwritten/hole extents from 122 * the tree under a heavy memory pressure. 123 * 124 * ========================================================================== 125 * 3. Assurance of Ext4 extent status tree consistency 126 * 127 * When mapping blocks, Ext4 queries the extent status tree first and should 128 * always trusts that the extent status tree is consistent and up to date. 129 * Therefore, it is important to adheres to the following rules when createing, 130 * modifying and removing extents. 131 * 132 * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, 133 * the extent information should always be processed through the extent 134 * status tree instead of being organized manually through the on-disk 135 * extent tree. 136 * 137 * 2. When updating the extent tree, Ext4 should acquire the i_data_sem 138 * exclusively and update the extent status tree atomically. If the extents 139 * to be modified are large enough to exceed the range that a single 140 * i_data_sem can process (as ext4_datasem_ensure_credits() may drop 141 * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() 142 * does): 143 * 144 * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures 145 * exclusion against page faults, as well as reads and writes that may 146 * concurrently modify the extent status tree. 147 * b) Evict all page cache in the affected range and recommend rebuilding 148 * or dropping the extent status tree after modifying the on-disk 149 * extent tree. This ensures exclusion against concurrent writebacks 150 * that do not hold those locks but only holds a folio lock. 151 * 152 * 3. Based on the rules above, when querying block mappings, Ext4 should at 153 * least hold the i_rwsem or invalidate_lock or folio lock(s) for the 154 * specified querying range. 155 * 156 * ========================================================================== 157 * 4. Performance analysis 158 * 159 * -- overhead 160 * 1. There is a cache extent for write access, so if writes are 161 * not very random, adding space operaions are in O(1) time. 162 * 163 * -- gain 164 * 2. Code is much simpler, more readable, more maintainable and 165 * more efficient. 166 * 167 * 168 * ========================================================================== 169 * 5. TODO list 170 * 171 * -- Refactor delayed space reservation 172 * 173 * -- Extent-level locking 174 */ 175 176 static struct kmem_cache *ext4_es_cachep; 177 static struct kmem_cache *ext4_pending_cachep; 178 179 static int __es_insert_extent(struct inode *inode, struct extent_status *newes, 180 struct extent_status *prealloc); 181 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 182 ext4_lblk_t end, unsigned int status, 183 int *reserved, struct extent_status *res, 184 struct extent_status *prealloc); 185 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); 186 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 187 struct ext4_inode_info *locked_ei); 188 static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, 189 ext4_lblk_t len, 190 struct pending_reservation **prealloc); 191 192 int __init ext4_init_es(void) 193 { 194 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); 195 if (ext4_es_cachep == NULL) 196 return -ENOMEM; 197 return 0; 198 } 199 200 void ext4_exit_es(void) 201 { 202 kmem_cache_destroy(ext4_es_cachep); 203 } 204 205 void ext4_es_init_tree(struct ext4_es_tree *tree) 206 { 207 tree->root = RB_ROOT; 208 tree->cache_es = NULL; 209 } 210 211 #ifdef ES_DEBUG__ 212 static void ext4_es_print_tree(struct inode *inode) 213 { 214 struct ext4_es_tree *tree; 215 struct rb_node *node; 216 217 printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); 218 tree = &EXT4_I(inode)->i_es_tree; 219 node = rb_first(&tree->root); 220 while (node) { 221 struct extent_status *es; 222 es = rb_entry(node, struct extent_status, rb_node); 223 printk(KERN_DEBUG " [%u/%u) %llu %x", 224 es->es_lblk, es->es_len, 225 ext4_es_pblock(es), ext4_es_status(es)); 226 node = rb_next(node); 227 } 228 printk(KERN_DEBUG "\n"); 229 } 230 #else 231 #define ext4_es_print_tree(inode) 232 #endif 233 234 static inline ext4_lblk_t ext4_es_end(struct extent_status *es) 235 { 236 BUG_ON(es->es_lblk + es->es_len < es->es_lblk); 237 return es->es_lblk + es->es_len - 1; 238 } 239 240 static inline void ext4_es_inc_seq(struct inode *inode) 241 { 242 struct ext4_inode_info *ei = EXT4_I(inode); 243 244 WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); 245 } 246 247 static inline int __es_check_extent_status(struct extent_status *es, 248 unsigned int status, 249 struct extent_status *res) 250 { 251 if (ext4_es_type(es) & status) 252 return 0; 253 254 if (res) { 255 res->es_lblk = es->es_lblk; 256 res->es_len = es->es_len; 257 res->es_pblk = es->es_pblk; 258 } 259 return -EINVAL; 260 } 261 262 /* 263 * search through the tree for an delayed extent with a given offset. If 264 * it can't be found, try to find next extent. 265 */ 266 static struct extent_status *__es_tree_search(struct rb_root *root, 267 ext4_lblk_t lblk) 268 { 269 struct rb_node *node = root->rb_node; 270 struct extent_status *es = NULL; 271 272 while (node) { 273 es = rb_entry(node, struct extent_status, rb_node); 274 if (lblk < es->es_lblk) 275 node = node->rb_left; 276 else if (lblk > ext4_es_end(es)) 277 node = node->rb_right; 278 else 279 return es; 280 } 281 282 if (es && lblk < es->es_lblk) 283 return es; 284 285 if (es && lblk > ext4_es_end(es)) { 286 node = rb_next(&es->rb_node); 287 return node ? rb_entry(node, struct extent_status, rb_node) : 288 NULL; 289 } 290 291 return NULL; 292 } 293 294 /* 295 * ext4_es_find_extent_range - find extent with specified status within block 296 * range or next extent following block range in 297 * extents status tree 298 * 299 * @inode - file containing the range 300 * @matching_fn - pointer to function that matches extents with desired status 301 * @lblk - logical block defining start of range 302 * @end - logical block defining end of range 303 * @es - extent found, if any 304 * 305 * Find the first extent within the block range specified by @lblk and @end 306 * in the extents status tree that satisfies @matching_fn. If a match 307 * is found, it's returned in @es. If not, and a matching extent is found 308 * beyond the block range, it's returned in @es. If no match is found, an 309 * extent is returned in @es whose es_lblk, es_len, and es_pblk components 310 * are 0. 311 */ 312 static void __es_find_extent_range(struct inode *inode, 313 int (*matching_fn)(struct extent_status *es), 314 ext4_lblk_t lblk, ext4_lblk_t end, 315 struct extent_status *es) 316 { 317 struct ext4_es_tree *tree = NULL; 318 struct extent_status *es1 = NULL; 319 struct rb_node *node; 320 321 WARN_ON(es == NULL); 322 WARN_ON(end < lblk); 323 324 tree = &EXT4_I(inode)->i_es_tree; 325 326 /* see if the extent has been cached */ 327 es->es_lblk = es->es_len = es->es_pblk = 0; 328 es1 = READ_ONCE(tree->cache_es); 329 if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { 330 es_debug("%u cached by [%u/%u) %llu %x\n", 331 lblk, es1->es_lblk, es1->es_len, 332 ext4_es_pblock(es1), ext4_es_status(es1)); 333 goto out; 334 } 335 336 es1 = __es_tree_search(&tree->root, lblk); 337 338 out: 339 if (es1 && !matching_fn(es1)) { 340 while ((node = rb_next(&es1->rb_node)) != NULL) { 341 es1 = rb_entry(node, struct extent_status, rb_node); 342 if (es1->es_lblk > end) { 343 es1 = NULL; 344 break; 345 } 346 if (matching_fn(es1)) 347 break; 348 } 349 } 350 351 if (es1 && matching_fn(es1)) { 352 WRITE_ONCE(tree->cache_es, es1); 353 es->es_lblk = es1->es_lblk; 354 es->es_len = es1->es_len; 355 es->es_pblk = es1->es_pblk; 356 } 357 358 } 359 360 /* 361 * Locking for __es_find_extent_range() for external use 362 */ 363 void ext4_es_find_extent_range(struct inode *inode, 364 int (*matching_fn)(struct extent_status *es), 365 ext4_lblk_t lblk, ext4_lblk_t end, 366 struct extent_status *es) 367 { 368 es->es_lblk = es->es_len = es->es_pblk = 0; 369 370 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 371 return; 372 373 trace_ext4_es_find_extent_range_enter(inode, lblk); 374 375 read_lock(&EXT4_I(inode)->i_es_lock); 376 __es_find_extent_range(inode, matching_fn, lblk, end, es); 377 read_unlock(&EXT4_I(inode)->i_es_lock); 378 379 trace_ext4_es_find_extent_range_exit(inode, es); 380 } 381 382 /* 383 * __es_scan_range - search block range for block with specified status 384 * in extents status tree 385 * 386 * @inode - file containing the range 387 * @matching_fn - pointer to function that matches extents with desired status 388 * @lblk - logical block defining start of range 389 * @end - logical block defining end of range 390 * 391 * Returns true if at least one block in the specified block range satisfies 392 * the criterion specified by @matching_fn, and false if not. If at least 393 * one extent has the specified status, then there is at least one block 394 * in the cluster with that status. Should only be called by code that has 395 * taken i_es_lock. 396 */ 397 static bool __es_scan_range(struct inode *inode, 398 int (*matching_fn)(struct extent_status *es), 399 ext4_lblk_t start, ext4_lblk_t end) 400 { 401 struct extent_status es; 402 403 __es_find_extent_range(inode, matching_fn, start, end, &es); 404 if (es.es_len == 0) 405 return false; /* no matching extent in the tree */ 406 else if (es.es_lblk <= start && 407 start < es.es_lblk + es.es_len) 408 return true; 409 else if (start <= es.es_lblk && es.es_lblk <= end) 410 return true; 411 else 412 return false; 413 } 414 /* 415 * Locking for __es_scan_range() for external use 416 */ 417 bool ext4_es_scan_range(struct inode *inode, 418 int (*matching_fn)(struct extent_status *es), 419 ext4_lblk_t lblk, ext4_lblk_t end) 420 { 421 bool ret; 422 423 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 424 return false; 425 426 read_lock(&EXT4_I(inode)->i_es_lock); 427 ret = __es_scan_range(inode, matching_fn, lblk, end); 428 read_unlock(&EXT4_I(inode)->i_es_lock); 429 430 return ret; 431 } 432 433 /* 434 * __es_scan_clu - search cluster for block with specified status in 435 * extents status tree 436 * 437 * @inode - file containing the cluster 438 * @matching_fn - pointer to function that matches extents with desired status 439 * @lblk - logical block in cluster to be searched 440 * 441 * Returns true if at least one extent in the cluster containing @lblk 442 * satisfies the criterion specified by @matching_fn, and false if not. If at 443 * least one extent has the specified status, then there is at least one block 444 * in the cluster with that status. Should only be called by code that has 445 * taken i_es_lock. 446 */ 447 static bool __es_scan_clu(struct inode *inode, 448 int (*matching_fn)(struct extent_status *es), 449 ext4_lblk_t lblk) 450 { 451 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 452 ext4_lblk_t lblk_start, lblk_end; 453 454 lblk_start = EXT4_LBLK_CMASK(sbi, lblk); 455 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 456 457 return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); 458 } 459 460 /* 461 * Locking for __es_scan_clu() for external use 462 */ 463 bool ext4_es_scan_clu(struct inode *inode, 464 int (*matching_fn)(struct extent_status *es), 465 ext4_lblk_t lblk) 466 { 467 bool ret; 468 469 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 470 return false; 471 472 read_lock(&EXT4_I(inode)->i_es_lock); 473 ret = __es_scan_clu(inode, matching_fn, lblk); 474 read_unlock(&EXT4_I(inode)->i_es_lock); 475 476 return ret; 477 } 478 479 static void ext4_es_list_add(struct inode *inode) 480 { 481 struct ext4_inode_info *ei = EXT4_I(inode); 482 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 483 484 if (!list_empty(&ei->i_es_list)) 485 return; 486 487 spin_lock(&sbi->s_es_lock); 488 if (list_empty(&ei->i_es_list)) { 489 list_add_tail(&ei->i_es_list, &sbi->s_es_list); 490 sbi->s_es_nr_inode++; 491 } 492 spin_unlock(&sbi->s_es_lock); 493 } 494 495 static void ext4_es_list_del(struct inode *inode) 496 { 497 struct ext4_inode_info *ei = EXT4_I(inode); 498 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 499 500 spin_lock(&sbi->s_es_lock); 501 if (!list_empty(&ei->i_es_list)) { 502 list_del_init(&ei->i_es_list); 503 sbi->s_es_nr_inode--; 504 WARN_ON_ONCE(sbi->s_es_nr_inode < 0); 505 } 506 spin_unlock(&sbi->s_es_lock); 507 } 508 509 static inline struct pending_reservation *__alloc_pending(bool nofail) 510 { 511 if (!nofail) 512 return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); 513 514 return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); 515 } 516 517 static inline void __free_pending(struct pending_reservation *pr) 518 { 519 kmem_cache_free(ext4_pending_cachep, pr); 520 } 521 522 /* 523 * Returns true if we cannot fail to allocate memory for this extent_status 524 * entry and cannot reclaim it until its status changes. 525 */ 526 static inline bool ext4_es_must_keep(struct extent_status *es) 527 { 528 /* fiemap, bigalloc, and seek_data/hole need to use it. */ 529 if (ext4_es_is_delayed(es)) 530 return true; 531 532 return false; 533 } 534 535 static inline struct extent_status *__es_alloc_extent(bool nofail) 536 { 537 if (!nofail) 538 return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); 539 540 return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); 541 } 542 543 static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, 544 ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) 545 { 546 es->es_lblk = lblk; 547 es->es_len = len; 548 es->es_pblk = pblk; 549 550 /* We never try to reclaim a must kept extent, so we don't count it. */ 551 if (!ext4_es_must_keep(es)) { 552 if (!EXT4_I(inode)->i_es_shk_nr++) 553 ext4_es_list_add(inode); 554 percpu_counter_inc(&EXT4_SB(inode->i_sb)-> 555 s_es_stats.es_stats_shk_cnt); 556 } 557 558 EXT4_I(inode)->i_es_all_nr++; 559 percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 560 } 561 562 static inline void __es_free_extent(struct extent_status *es) 563 { 564 kmem_cache_free(ext4_es_cachep, es); 565 } 566 567 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) 568 { 569 EXT4_I(inode)->i_es_all_nr--; 570 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 571 572 /* Decrease the shrink counter when we can reclaim the extent. */ 573 if (!ext4_es_must_keep(es)) { 574 BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); 575 if (!--EXT4_I(inode)->i_es_shk_nr) 576 ext4_es_list_del(inode); 577 percpu_counter_dec(&EXT4_SB(inode->i_sb)-> 578 s_es_stats.es_stats_shk_cnt); 579 } 580 581 __es_free_extent(es); 582 } 583 584 /* 585 * Check whether or not two extents can be merged 586 * Condition: 587 * - logical block number is contiguous 588 * - physical block number is contiguous 589 * - status is equal 590 */ 591 static int ext4_es_can_be_merged(struct extent_status *es1, 592 struct extent_status *es2) 593 { 594 if (ext4_es_type(es1) != ext4_es_type(es2)) 595 return 0; 596 597 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { 598 pr_warn("ES assertion failed when merging extents. " 599 "The sum of lengths of es1 (%d) and es2 (%d) " 600 "is bigger than allowed file size (%d)\n", 601 es1->es_len, es2->es_len, EXT_MAX_BLOCKS); 602 WARN_ON(1); 603 return 0; 604 } 605 606 if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) 607 return 0; 608 609 if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && 610 (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) 611 return 1; 612 613 if (ext4_es_is_hole(es1)) 614 return 1; 615 616 /* we need to check delayed extent */ 617 if (ext4_es_is_delayed(es1)) 618 return 1; 619 620 return 0; 621 } 622 623 static struct extent_status * 624 ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) 625 { 626 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; 627 struct extent_status *es1; 628 struct rb_node *node; 629 630 node = rb_prev(&es->rb_node); 631 if (!node) 632 return es; 633 634 es1 = rb_entry(node, struct extent_status, rb_node); 635 if (ext4_es_can_be_merged(es1, es)) { 636 es1->es_len += es->es_len; 637 if (ext4_es_is_referenced(es)) 638 ext4_es_set_referenced(es1); 639 rb_erase(&es->rb_node, &tree->root); 640 ext4_es_free_extent(inode, es); 641 es = es1; 642 } 643 644 return es; 645 } 646 647 static struct extent_status * 648 ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) 649 { 650 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; 651 struct extent_status *es1; 652 struct rb_node *node; 653 654 node = rb_next(&es->rb_node); 655 if (!node) 656 return es; 657 658 es1 = rb_entry(node, struct extent_status, rb_node); 659 if (ext4_es_can_be_merged(es, es1)) { 660 es->es_len += es1->es_len; 661 if (ext4_es_is_referenced(es1)) 662 ext4_es_set_referenced(es); 663 rb_erase(node, &tree->root); 664 ext4_es_free_extent(inode, es1); 665 } 666 667 return es; 668 } 669 670 #ifdef ES_AGGRESSIVE_TEST 671 #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ 672 673 static void ext4_es_insert_extent_ext_check(struct inode *inode, 674 struct extent_status *es) 675 { 676 struct ext4_ext_path *path = NULL; 677 struct ext4_extent *ex; 678 ext4_lblk_t ee_block; 679 ext4_fsblk_t ee_start; 680 unsigned short ee_len; 681 int depth, ee_status, es_status; 682 683 path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); 684 if (IS_ERR(path)) 685 return; 686 687 depth = ext_depth(inode); 688 ex = path[depth].p_ext; 689 690 if (ex) { 691 692 ee_block = le32_to_cpu(ex->ee_block); 693 ee_start = ext4_ext_pblock(ex); 694 ee_len = ext4_ext_get_actual_len(ex); 695 696 ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; 697 es_status = ext4_es_is_unwritten(es) ? 1 : 0; 698 699 /* 700 * Make sure ex and es are not overlap when we try to insert 701 * a delayed/hole extent. 702 */ 703 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { 704 if (in_range(es->es_lblk, ee_block, ee_len)) { 705 pr_warn("ES insert assertion failed for " 706 "inode: %lu we can find an extent " 707 "at block [%d/%d/%llu/%c], but we " 708 "want to add a delayed/hole extent " 709 "[%d/%d/%llu/%x]\n", 710 inode->i_ino, ee_block, ee_len, 711 ee_start, ee_status ? 'u' : 'w', 712 es->es_lblk, es->es_len, 713 ext4_es_pblock(es), ext4_es_status(es)); 714 } 715 goto out; 716 } 717 718 /* 719 * We don't check ee_block == es->es_lblk, etc. because es 720 * might be a part of whole extent, vice versa. 721 */ 722 if (es->es_lblk < ee_block || 723 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { 724 pr_warn("ES insert assertion failed for inode: %lu " 725 "ex_status [%d/%d/%llu/%c] != " 726 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 727 ee_block, ee_len, ee_start, 728 ee_status ? 'u' : 'w', es->es_lblk, es->es_len, 729 ext4_es_pblock(es), es_status ? 'u' : 'w'); 730 goto out; 731 } 732 733 if (ee_status ^ es_status) { 734 pr_warn("ES insert assertion failed for inode: %lu " 735 "ex_status [%d/%d/%llu/%c] != " 736 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 737 ee_block, ee_len, ee_start, 738 ee_status ? 'u' : 'w', es->es_lblk, es->es_len, 739 ext4_es_pblock(es), es_status ? 'u' : 'w'); 740 } 741 } else { 742 /* 743 * We can't find an extent on disk. So we need to make sure 744 * that we don't want to add an written/unwritten extent. 745 */ 746 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 747 pr_warn("ES insert assertion failed for inode: %lu " 748 "can't find an extent at block %d but we want " 749 "to add a written/unwritten extent " 750 "[%d/%d/%llu/%x]\n", inode->i_ino, 751 es->es_lblk, es->es_lblk, es->es_len, 752 ext4_es_pblock(es), ext4_es_status(es)); 753 } 754 } 755 out: 756 ext4_free_ext_path(path); 757 } 758 759 static void ext4_es_insert_extent_ind_check(struct inode *inode, 760 struct extent_status *es) 761 { 762 struct ext4_map_blocks map; 763 int retval; 764 765 /* 766 * Here we call ext4_ind_map_blocks to lookup a block mapping because 767 * 'Indirect' structure is defined in indirect.c. So we couldn't 768 * access direct/indirect tree from outside. It is too dirty to define 769 * this function in indirect.c file. 770 */ 771 772 map.m_lblk = es->es_lblk; 773 map.m_len = es->es_len; 774 775 retval = ext4_ind_map_blocks(NULL, inode, &map, 0); 776 if (retval > 0) { 777 if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { 778 /* 779 * We want to add a delayed/hole extent but this 780 * block has been allocated. 781 */ 782 pr_warn("ES insert assertion failed for inode: %lu " 783 "We can find blocks but we want to add a " 784 "delayed/hole extent [%d/%d/%llu/%x]\n", 785 inode->i_ino, es->es_lblk, es->es_len, 786 ext4_es_pblock(es), ext4_es_status(es)); 787 return; 788 } else if (ext4_es_is_written(es)) { 789 if (retval != es->es_len) { 790 pr_warn("ES insert assertion failed for " 791 "inode: %lu retval %d != es_len %d\n", 792 inode->i_ino, retval, es->es_len); 793 return; 794 } 795 if (map.m_pblk != ext4_es_pblock(es)) { 796 pr_warn("ES insert assertion failed for " 797 "inode: %lu m_pblk %llu != " 798 "es_pblk %llu\n", 799 inode->i_ino, map.m_pblk, 800 ext4_es_pblock(es)); 801 return; 802 } 803 } else { 804 /* 805 * We don't need to check unwritten extent because 806 * indirect-based file doesn't have it. 807 */ 808 BUG(); 809 } 810 } else if (retval == 0) { 811 if (ext4_es_is_written(es)) { 812 pr_warn("ES insert assertion failed for inode: %lu " 813 "We can't find the block but we want to add " 814 "a written extent [%d/%d/%llu/%x]\n", 815 inode->i_ino, es->es_lblk, es->es_len, 816 ext4_es_pblock(es), ext4_es_status(es)); 817 return; 818 } 819 } 820 } 821 822 static inline void ext4_es_insert_extent_check(struct inode *inode, 823 struct extent_status *es) 824 { 825 /* 826 * We don't need to worry about the race condition because 827 * caller takes i_data_sem locking. 828 */ 829 BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); 830 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 831 ext4_es_insert_extent_ext_check(inode, es); 832 else 833 ext4_es_insert_extent_ind_check(inode, es); 834 } 835 #else 836 static inline void ext4_es_insert_extent_check(struct inode *inode, 837 struct extent_status *es) 838 { 839 } 840 #endif 841 842 static int __es_insert_extent(struct inode *inode, struct extent_status *newes, 843 struct extent_status *prealloc) 844 { 845 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; 846 struct rb_node **p = &tree->root.rb_node; 847 struct rb_node *parent = NULL; 848 struct extent_status *es; 849 850 while (*p) { 851 parent = *p; 852 es = rb_entry(parent, struct extent_status, rb_node); 853 854 if (newes->es_lblk < es->es_lblk) { 855 if (ext4_es_can_be_merged(newes, es)) { 856 /* 857 * Here we can modify es_lblk directly 858 * because it isn't overlapped. 859 */ 860 es->es_lblk = newes->es_lblk; 861 es->es_len += newes->es_len; 862 if (ext4_es_is_written(es) || 863 ext4_es_is_unwritten(es)) 864 ext4_es_store_pblock(es, 865 newes->es_pblk); 866 es = ext4_es_try_to_merge_left(inode, es); 867 goto out; 868 } 869 p = &(*p)->rb_left; 870 } else if (newes->es_lblk > ext4_es_end(es)) { 871 if (ext4_es_can_be_merged(es, newes)) { 872 es->es_len += newes->es_len; 873 es = ext4_es_try_to_merge_right(inode, es); 874 goto out; 875 } 876 p = &(*p)->rb_right; 877 } else { 878 BUG(); 879 return -EINVAL; 880 } 881 } 882 883 if (prealloc) 884 es = prealloc; 885 else 886 es = __es_alloc_extent(false); 887 if (!es) 888 return -ENOMEM; 889 ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, 890 newes->es_pblk); 891 892 rb_link_node(&es->rb_node, parent, p); 893 rb_insert_color(&es->rb_node, &tree->root); 894 895 out: 896 tree->cache_es = es; 897 return 0; 898 } 899 900 /* 901 * ext4_es_insert_extent() adds information to an inode's extent 902 * status tree. This interface is used for modifying extents. To cache 903 * on-disk extents, use ext4_es_cache_extent() instead. 904 */ 905 void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, 906 ext4_lblk_t len, ext4_fsblk_t pblk, 907 unsigned int status, bool delalloc_reserve_used) 908 { 909 struct extent_status newes; 910 ext4_lblk_t end = lblk + len - 1; 911 int err1 = 0, err2 = 0, err3 = 0; 912 int resv_used = 0, pending = 0; 913 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 914 struct extent_status *es1 = NULL; 915 struct extent_status *es2 = NULL; 916 struct pending_reservation *pr = NULL; 917 bool revise_pending = false; 918 919 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 920 return; 921 922 es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", 923 lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); 924 925 if (!len) 926 return; 927 928 BUG_ON(end < lblk); 929 WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); 930 931 newes.es_lblk = lblk; 932 newes.es_len = len; 933 ext4_es_store_pblock_status(&newes, pblk, status); 934 935 ext4_es_insert_extent_check(inode, &newes); 936 937 revise_pending = sbi->s_cluster_ratio > 1 && 938 test_opt(inode->i_sb, DELALLOC) && 939 (status & (EXTENT_STATUS_WRITTEN | 940 EXTENT_STATUS_UNWRITTEN)); 941 retry: 942 if (err1 && !es1) 943 es1 = __es_alloc_extent(true); 944 if ((err1 || err2) && !es2) 945 es2 = __es_alloc_extent(true); 946 if ((err1 || err2 || err3 < 0) && revise_pending && !pr) 947 pr = __alloc_pending(true); 948 write_lock(&EXT4_I(inode)->i_es_lock); 949 950 err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1); 951 if (err1 != 0) 952 goto error; 953 /* Free preallocated extent if it didn't get used. */ 954 if (es1) { 955 if (!es1->es_len) 956 __es_free_extent(es1); 957 es1 = NULL; 958 } 959 960 err2 = __es_insert_extent(inode, &newes, es2); 961 if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) 962 err2 = 0; 963 if (err2 != 0) 964 goto error; 965 /* Free preallocated extent if it didn't get used. */ 966 if (es2) { 967 if (!es2->es_len) 968 __es_free_extent(es2); 969 es2 = NULL; 970 } 971 972 if (revise_pending) { 973 err3 = __revise_pending(inode, lblk, len, &pr); 974 if (err3 < 0) 975 goto error; 976 if (pr) { 977 __free_pending(pr); 978 pr = NULL; 979 } 980 pending = err3; 981 } 982 ext4_es_inc_seq(inode); 983 error: 984 write_unlock(&EXT4_I(inode)->i_es_lock); 985 /* 986 * Reduce the reserved cluster count to reflect successful deferred 987 * allocation of delayed allocated clusters or direct allocation of 988 * clusters discovered to be delayed allocated. Once allocated, a 989 * cluster is not included in the reserved count. 990 * 991 * When direct allocating (from fallocate, filemap, DIO, or clusters 992 * allocated when delalloc has been disabled by ext4_nonda_switch()) 993 * an extent either 1) contains delayed blocks but start with 994 * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed 995 * allocated blocks which belong to delayed allocated clusters when 996 * bigalloc feature is enabled, quota has already been claimed by 997 * ext4_mb_new_blocks(), so release the quota reservations made for 998 * any previously delayed allocated clusters instead of claim them 999 * again. 1000 */ 1001 resv_used += pending; 1002 if (resv_used) 1003 ext4_da_update_reserve_space(inode, resv_used, 1004 delalloc_reserve_used); 1005 1006 if (err1 || err2 || err3 < 0) 1007 goto retry; 1008 1009 trace_ext4_es_insert_extent(inode, &newes); 1010 ext4_es_print_tree(inode); 1011 return; 1012 } 1013 1014 /* 1015 * ext4_es_cache_extent() inserts information into the extent status tree 1016 * only if there is no existing information about the specified range or 1017 * if the existing extents have the same status. 1018 * 1019 * Note that this interface is only used for caching on-disk extent 1020 * information and cannot be used to convert existing extents in the extent 1021 * status tree. To convert existing extents, use ext4_es_insert_extent() 1022 * instead. 1023 */ 1024 void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, 1025 ext4_lblk_t len, ext4_fsblk_t pblk, 1026 unsigned int status) 1027 { 1028 struct extent_status *es; 1029 struct extent_status chkes, newes; 1030 ext4_lblk_t end = lblk + len - 1; 1031 bool conflict = false; 1032 int err; 1033 1034 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 1035 return; 1036 1037 newes.es_lblk = lblk; 1038 newes.es_len = len; 1039 ext4_es_store_pblock_status(&newes, pblk, status); 1040 1041 if (!len) 1042 return; 1043 1044 BUG_ON(end < lblk); 1045 1046 write_lock(&EXT4_I(inode)->i_es_lock); 1047 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); 1048 if (es && es->es_lblk <= end) { 1049 /* Found an extent that covers the entire range. */ 1050 if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) { 1051 if (__es_check_extent_status(es, status, &chkes)) 1052 conflict = true; 1053 goto unlock; 1054 } 1055 /* Check and remove all extents in range. */ 1056 err = __es_remove_extent(inode, lblk, end, status, NULL, 1057 &chkes, NULL); 1058 if (err) { 1059 if (err == -EINVAL) 1060 conflict = true; 1061 goto unlock; 1062 } 1063 } 1064 __es_insert_extent(inode, &newes, NULL); 1065 trace_ext4_es_cache_extent(inode, &newes); 1066 ext4_es_print_tree(inode); 1067 unlock: 1068 write_unlock(&EXT4_I(inode)->i_es_lock); 1069 if (!conflict) 1070 return; 1071 /* 1072 * A hole in the on-disk extent but a delayed extent in the extent 1073 * status tree, is allowed. 1074 */ 1075 if (status == EXTENT_STATUS_HOLE && 1076 ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED) 1077 return; 1078 1079 ext4_warning_inode(inode, 1080 "ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n", 1081 lblk, len, pblk, status, chkes.es_lblk, chkes.es_len, 1082 ext4_es_pblock(&chkes), ext4_es_status(&chkes)); 1083 } 1084 1085 /* 1086 * ext4_es_lookup_extent() looks up an extent in extent status tree. 1087 * 1088 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. 1089 * 1090 * Return: 1 on found, 0 on not 1091 */ 1092 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 1093 ext4_lblk_t *next_lblk, struct extent_status *es, 1094 u64 *pseq) 1095 { 1096 struct ext4_es_tree *tree; 1097 struct ext4_es_stats *stats; 1098 struct extent_status *es1 = NULL; 1099 struct rb_node *node; 1100 int found = 0; 1101 1102 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 1103 return 0; 1104 1105 trace_ext4_es_lookup_extent_enter(inode, lblk); 1106 es_debug("lookup extent in block %u\n", lblk); 1107 1108 tree = &EXT4_I(inode)->i_es_tree; 1109 read_lock(&EXT4_I(inode)->i_es_lock); 1110 1111 /* find extent in cache firstly */ 1112 es->es_lblk = es->es_len = es->es_pblk = 0; 1113 es1 = READ_ONCE(tree->cache_es); 1114 if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { 1115 es_debug("%u cached by [%u/%u)\n", 1116 lblk, es1->es_lblk, es1->es_len); 1117 found = 1; 1118 goto out; 1119 } 1120 1121 node = tree->root.rb_node; 1122 while (node) { 1123 es1 = rb_entry(node, struct extent_status, rb_node); 1124 if (lblk < es1->es_lblk) 1125 node = node->rb_left; 1126 else if (lblk > ext4_es_end(es1)) 1127 node = node->rb_right; 1128 else { 1129 found = 1; 1130 break; 1131 } 1132 } 1133 1134 out: 1135 stats = &EXT4_SB(inode->i_sb)->s_es_stats; 1136 if (found) { 1137 BUG_ON(!es1); 1138 es->es_lblk = es1->es_lblk; 1139 es->es_len = es1->es_len; 1140 es->es_pblk = es1->es_pblk; 1141 if (!ext4_es_is_referenced(es1)) 1142 ext4_es_set_referenced(es1); 1143 percpu_counter_inc(&stats->es_stats_cache_hits); 1144 if (next_lblk) { 1145 node = rb_next(&es1->rb_node); 1146 if (node) { 1147 es1 = rb_entry(node, struct extent_status, 1148 rb_node); 1149 *next_lblk = es1->es_lblk; 1150 } else 1151 *next_lblk = 0; 1152 } 1153 if (pseq) 1154 *pseq = EXT4_I(inode)->i_es_seq; 1155 } else { 1156 percpu_counter_inc(&stats->es_stats_cache_misses); 1157 } 1158 1159 read_unlock(&EXT4_I(inode)->i_es_lock); 1160 1161 trace_ext4_es_lookup_extent_exit(inode, es, found); 1162 return found; 1163 } 1164 1165 struct rsvd_count { 1166 int ndelayed; 1167 bool first_do_lblk_found; 1168 ext4_lblk_t first_do_lblk; 1169 ext4_lblk_t last_do_lblk; 1170 struct extent_status *left_es; 1171 bool partial; 1172 ext4_lblk_t lclu; 1173 }; 1174 1175 /* 1176 * init_rsvd - initialize reserved count data before removing block range 1177 * in file from extent status tree 1178 * 1179 * @inode - file containing range 1180 * @lblk - first block in range 1181 * @es - pointer to first extent in range 1182 * @rc - pointer to reserved count data 1183 * 1184 * Assumes es is not NULL 1185 */ 1186 static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, 1187 struct extent_status *es, struct rsvd_count *rc) 1188 { 1189 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1190 struct rb_node *node; 1191 1192 rc->ndelayed = 0; 1193 1194 /* 1195 * for bigalloc, note the first delayed block in the range has not 1196 * been found, record the extent containing the block to the left of 1197 * the region to be removed, if any, and note that there's no partial 1198 * cluster to track 1199 */ 1200 if (sbi->s_cluster_ratio > 1) { 1201 rc->first_do_lblk_found = false; 1202 if (lblk > es->es_lblk) { 1203 rc->left_es = es; 1204 } else { 1205 node = rb_prev(&es->rb_node); 1206 rc->left_es = node ? rb_entry(node, 1207 struct extent_status, 1208 rb_node) : NULL; 1209 } 1210 rc->partial = false; 1211 } 1212 } 1213 1214 /* 1215 * count_rsvd - count the clusters containing delayed blocks in a range 1216 * within an extent and add to the running tally in rsvd_count 1217 * 1218 * @inode - file containing extent 1219 * @lblk - first block in range 1220 * @len - length of range in blocks 1221 * @es - pointer to extent containing clusters to be counted 1222 * @rc - pointer to reserved count data 1223 * 1224 * Tracks partial clusters found at the beginning and end of extents so 1225 * they aren't overcounted when they span adjacent extents 1226 */ 1227 static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, 1228 struct extent_status *es, struct rsvd_count *rc) 1229 { 1230 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1231 ext4_lblk_t i, end, nclu; 1232 1233 if (!ext4_es_is_delayed(es)) 1234 return; 1235 1236 WARN_ON(len <= 0); 1237 1238 if (sbi->s_cluster_ratio == 1) { 1239 rc->ndelayed += (int) len; 1240 return; 1241 } 1242 1243 /* bigalloc */ 1244 1245 i = (lblk < es->es_lblk) ? es->es_lblk : lblk; 1246 end = lblk + (ext4_lblk_t) len - 1; 1247 end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; 1248 1249 /* record the first block of the first delayed extent seen */ 1250 if (!rc->first_do_lblk_found) { 1251 rc->first_do_lblk = i; 1252 rc->first_do_lblk_found = true; 1253 } 1254 1255 /* update the last lblk in the region seen so far */ 1256 rc->last_do_lblk = end; 1257 1258 /* 1259 * if we're tracking a partial cluster and the current extent 1260 * doesn't start with it, count it and stop tracking 1261 */ 1262 if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { 1263 rc->ndelayed++; 1264 rc->partial = false; 1265 } 1266 1267 /* 1268 * if the first cluster doesn't start on a cluster boundary but 1269 * ends on one, count it 1270 */ 1271 if (EXT4_LBLK_COFF(sbi, i) != 0) { 1272 if (end >= EXT4_LBLK_CFILL(sbi, i)) { 1273 rc->ndelayed++; 1274 rc->partial = false; 1275 i = EXT4_LBLK_CFILL(sbi, i) + 1; 1276 } 1277 } 1278 1279 /* 1280 * if the current cluster starts on a cluster boundary, count the 1281 * number of whole delayed clusters in the extent 1282 */ 1283 if ((i + sbi->s_cluster_ratio - 1) <= end) { 1284 nclu = (end - i + 1) >> sbi->s_cluster_bits; 1285 rc->ndelayed += nclu; 1286 i += nclu << sbi->s_cluster_bits; 1287 } 1288 1289 /* 1290 * start tracking a partial cluster if there's a partial at the end 1291 * of the current extent and we're not already tracking one 1292 */ 1293 if (!rc->partial && i <= end) { 1294 rc->partial = true; 1295 rc->lclu = EXT4_B2C(sbi, i); 1296 } 1297 } 1298 1299 /* 1300 * __pr_tree_search - search for a pending cluster reservation 1301 * 1302 * @root - root of pending reservation tree 1303 * @lclu - logical cluster to search for 1304 * 1305 * Returns the pending reservation for the cluster identified by @lclu 1306 * if found. If not, returns a reservation for the next cluster if any, 1307 * and if not, returns NULL. 1308 */ 1309 static struct pending_reservation *__pr_tree_search(struct rb_root *root, 1310 ext4_lblk_t lclu) 1311 { 1312 struct rb_node *node = root->rb_node; 1313 struct pending_reservation *pr = NULL; 1314 1315 while (node) { 1316 pr = rb_entry(node, struct pending_reservation, rb_node); 1317 if (lclu < pr->lclu) 1318 node = node->rb_left; 1319 else if (lclu > pr->lclu) 1320 node = node->rb_right; 1321 else 1322 return pr; 1323 } 1324 if (pr && lclu < pr->lclu) 1325 return pr; 1326 if (pr && lclu > pr->lclu) { 1327 node = rb_next(&pr->rb_node); 1328 return node ? rb_entry(node, struct pending_reservation, 1329 rb_node) : NULL; 1330 } 1331 return NULL; 1332 } 1333 1334 /* 1335 * get_rsvd - calculates and returns the number of cluster reservations to be 1336 * released when removing a block range from the extent status tree 1337 * and releases any pending reservations within the range 1338 * 1339 * @inode - file containing block range 1340 * @end - last block in range 1341 * @right_es - pointer to extent containing next block beyond end or NULL 1342 * @rc - pointer to reserved count data 1343 * 1344 * The number of reservations to be released is equal to the number of 1345 * clusters containing delayed blocks within the range, minus the number of 1346 * clusters still containing delayed blocks at the ends of the range, and 1347 * minus the number of pending reservations within the range. 1348 */ 1349 static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, 1350 struct extent_status *right_es, 1351 struct rsvd_count *rc) 1352 { 1353 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1354 struct pending_reservation *pr; 1355 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; 1356 struct rb_node *node; 1357 ext4_lblk_t first_lclu, last_lclu; 1358 bool left_delayed, right_delayed, count_pending; 1359 struct extent_status *es; 1360 1361 if (sbi->s_cluster_ratio > 1) { 1362 /* count any remaining partial cluster */ 1363 if (rc->partial) 1364 rc->ndelayed++; 1365 1366 if (rc->ndelayed == 0) 1367 return 0; 1368 1369 first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); 1370 last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); 1371 1372 /* 1373 * decrease the delayed count by the number of clusters at the 1374 * ends of the range that still contain delayed blocks - 1375 * these clusters still need to be reserved 1376 */ 1377 left_delayed = right_delayed = false; 1378 1379 es = rc->left_es; 1380 while (es && ext4_es_end(es) >= 1381 EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { 1382 if (ext4_es_is_delayed(es)) { 1383 rc->ndelayed--; 1384 left_delayed = true; 1385 break; 1386 } 1387 node = rb_prev(&es->rb_node); 1388 if (!node) 1389 break; 1390 es = rb_entry(node, struct extent_status, rb_node); 1391 } 1392 if (right_es && (!left_delayed || first_lclu != last_lclu)) { 1393 if (end < ext4_es_end(right_es)) { 1394 es = right_es; 1395 } else { 1396 node = rb_next(&right_es->rb_node); 1397 es = node ? rb_entry(node, struct extent_status, 1398 rb_node) : NULL; 1399 } 1400 while (es && es->es_lblk <= 1401 EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { 1402 if (ext4_es_is_delayed(es)) { 1403 rc->ndelayed--; 1404 right_delayed = true; 1405 break; 1406 } 1407 node = rb_next(&es->rb_node); 1408 if (!node) 1409 break; 1410 es = rb_entry(node, struct extent_status, 1411 rb_node); 1412 } 1413 } 1414 1415 /* 1416 * Determine the block range that should be searched for 1417 * pending reservations, if any. Clusters on the ends of the 1418 * original removed range containing delayed blocks are 1419 * excluded. They've already been accounted for and it's not 1420 * possible to determine if an associated pending reservation 1421 * should be released with the information available in the 1422 * extents status tree. 1423 */ 1424 if (first_lclu == last_lclu) { 1425 if (left_delayed | right_delayed) 1426 count_pending = false; 1427 else 1428 count_pending = true; 1429 } else { 1430 if (left_delayed) 1431 first_lclu++; 1432 if (right_delayed) 1433 last_lclu--; 1434 if (first_lclu <= last_lclu) 1435 count_pending = true; 1436 else 1437 count_pending = false; 1438 } 1439 1440 /* 1441 * a pending reservation found between first_lclu and last_lclu 1442 * represents an allocated cluster that contained at least one 1443 * delayed block, so the delayed total must be reduced by one 1444 * for each pending reservation found and released 1445 */ 1446 if (count_pending) { 1447 pr = __pr_tree_search(&tree->root, first_lclu); 1448 while (pr && pr->lclu <= last_lclu) { 1449 rc->ndelayed--; 1450 node = rb_next(&pr->rb_node); 1451 rb_erase(&pr->rb_node, &tree->root); 1452 __free_pending(pr); 1453 if (!node) 1454 break; 1455 pr = rb_entry(node, struct pending_reservation, 1456 rb_node); 1457 } 1458 } 1459 } 1460 return rc->ndelayed; 1461 } 1462 1463 /* 1464 * __es_remove_extent - removes block range from extent status tree 1465 * 1466 * @inode - file containing range 1467 * @lblk - first block in range 1468 * @end - last block in range 1469 * @status - the extent status to be checked 1470 * @reserved - number of cluster reservations released 1471 * @res - return the extent if the status is not match 1472 * @prealloc - pre-allocated es to avoid memory allocation failures 1473 * 1474 * If @reserved is not NULL and delayed allocation is enabled, counts 1475 * block/cluster reservations freed by removing range and if bigalloc 1476 * enabled cancels pending reservations as needed. If @status is not 1477 * zero, check extent status type while removing extent, return -EINVAL 1478 * and pass out the extent through @res if not match. Returns 0 on 1479 * success, error code on failure. 1480 */ 1481 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 1482 ext4_lblk_t end, unsigned int status, 1483 int *reserved, struct extent_status *res, 1484 struct extent_status *prealloc) 1485 { 1486 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; 1487 struct rb_node *node; 1488 struct extent_status *es; 1489 struct extent_status orig_es; 1490 ext4_lblk_t len1, len2; 1491 ext4_fsblk_t block; 1492 int err; 1493 bool count_reserved = true; 1494 struct rsvd_count rc; 1495 1496 if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) 1497 count_reserved = false; 1498 if (status == 0) 1499 status = ES_TYPE_MASK; 1500 1501 es = __es_tree_search(&tree->root, lblk); 1502 if (!es) 1503 return 0; 1504 if (es->es_lblk > end) 1505 return 0; 1506 1507 err = __es_check_extent_status(es, status, res); 1508 if (err) 1509 return err; 1510 1511 /* Simply invalidate cache_es. */ 1512 tree->cache_es = NULL; 1513 if (count_reserved) 1514 init_rsvd(inode, lblk, es, &rc); 1515 1516 orig_es.es_lblk = es->es_lblk; 1517 orig_es.es_len = es->es_len; 1518 orig_es.es_pblk = es->es_pblk; 1519 1520 len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; 1521 len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; 1522 if (len1 > 0) 1523 es->es_len = len1; 1524 if (len2 > 0) { 1525 if (len1 > 0) { 1526 struct extent_status newes; 1527 1528 newes.es_lblk = end + 1; 1529 newes.es_len = len2; 1530 block = 0x7FDEADBEEFULL; 1531 if (ext4_es_is_written(&orig_es) || 1532 ext4_es_is_unwritten(&orig_es)) 1533 block = ext4_es_pblock(&orig_es) + 1534 orig_es.es_len - len2; 1535 ext4_es_store_pblock_status(&newes, block, 1536 ext4_es_status(&orig_es)); 1537 err = __es_insert_extent(inode, &newes, prealloc); 1538 if (err) { 1539 if (!ext4_es_must_keep(&newes)) 1540 return 0; 1541 1542 es->es_lblk = orig_es.es_lblk; 1543 es->es_len = orig_es.es_len; 1544 return err; 1545 } 1546 } else { 1547 es->es_lblk = end + 1; 1548 es->es_len = len2; 1549 if (ext4_es_is_written(es) || 1550 ext4_es_is_unwritten(es)) { 1551 block = orig_es.es_pblk + orig_es.es_len - len2; 1552 ext4_es_store_pblock(es, block); 1553 } 1554 } 1555 if (count_reserved) 1556 count_rsvd(inode, orig_es.es_lblk + len1, 1557 orig_es.es_len - len1 - len2, &orig_es, &rc); 1558 goto out; 1559 } 1560 1561 if (len1 > 0) { 1562 if (count_reserved) 1563 count_rsvd(inode, lblk, orig_es.es_len - len1, 1564 &orig_es, &rc); 1565 node = rb_next(&es->rb_node); 1566 if (node) 1567 es = rb_entry(node, struct extent_status, rb_node); 1568 else 1569 es = NULL; 1570 } 1571 1572 while (es && ext4_es_end(es) <= end) { 1573 err = __es_check_extent_status(es, status, res); 1574 if (err) 1575 return err; 1576 if (count_reserved) 1577 count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); 1578 node = rb_next(&es->rb_node); 1579 rb_erase(&es->rb_node, &tree->root); 1580 ext4_es_free_extent(inode, es); 1581 if (!node) { 1582 es = NULL; 1583 break; 1584 } 1585 es = rb_entry(node, struct extent_status, rb_node); 1586 } 1587 1588 if (es && es->es_lblk < end + 1) { 1589 ext4_lblk_t orig_len = es->es_len; 1590 1591 err = __es_check_extent_status(es, status, res); 1592 if (err) 1593 return err; 1594 1595 len1 = ext4_es_end(es) - end; 1596 if (count_reserved) 1597 count_rsvd(inode, es->es_lblk, orig_len - len1, 1598 es, &rc); 1599 es->es_lblk = end + 1; 1600 es->es_len = len1; 1601 if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { 1602 block = es->es_pblk + orig_len - len1; 1603 ext4_es_store_pblock(es, block); 1604 } 1605 } 1606 1607 out: 1608 if (count_reserved) 1609 *reserved = get_rsvd(inode, end, es, &rc); 1610 return 0; 1611 } 1612 1613 /* 1614 * ext4_es_remove_extent - removes block range from extent status tree 1615 * 1616 * @inode - file containing range 1617 * @lblk - first block in range 1618 * @len - number of blocks to remove 1619 * 1620 * Reduces block/cluster reservation count and for bigalloc cancels pending 1621 * reservations as needed. 1622 */ 1623 void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 1624 ext4_lblk_t len) 1625 { 1626 ext4_lblk_t end; 1627 int err = 0; 1628 int reserved = 0; 1629 struct extent_status *es = NULL; 1630 1631 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 1632 return; 1633 1634 es_debug("remove [%u/%u) from extent status tree of inode %lu\n", 1635 lblk, len, inode->i_ino); 1636 1637 if (!len) 1638 return; 1639 1640 end = lblk + len - 1; 1641 BUG_ON(end < lblk); 1642 1643 retry: 1644 if (err && !es) 1645 es = __es_alloc_extent(true); 1646 /* 1647 * ext4_clear_inode() depends on us taking i_es_lock unconditionally 1648 * so that we are sure __es_shrink() is done with the inode before it 1649 * is reclaimed. 1650 */ 1651 write_lock(&EXT4_I(inode)->i_es_lock); 1652 err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es); 1653 if (err) 1654 goto error; 1655 /* Free preallocated extent if it didn't get used. */ 1656 if (es) { 1657 if (!es->es_len) 1658 __es_free_extent(es); 1659 es = NULL; 1660 } 1661 ext4_es_inc_seq(inode); 1662 error: 1663 write_unlock(&EXT4_I(inode)->i_es_lock); 1664 if (err) 1665 goto retry; 1666 1667 trace_ext4_es_remove_extent(inode, lblk, len); 1668 ext4_es_print_tree(inode); 1669 ext4_da_release_space(inode, reserved); 1670 } 1671 1672 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 1673 struct ext4_inode_info *locked_ei) 1674 { 1675 struct ext4_inode_info *ei; 1676 struct ext4_es_stats *es_stats; 1677 ktime_t start_time; 1678 u64 scan_time; 1679 int nr_to_walk; 1680 int nr_shrunk = 0; 1681 int retried = 0, nr_skipped = 0; 1682 1683 es_stats = &sbi->s_es_stats; 1684 start_time = ktime_get(); 1685 1686 retry: 1687 spin_lock(&sbi->s_es_lock); 1688 nr_to_walk = sbi->s_es_nr_inode; 1689 while (nr_to_walk-- > 0) { 1690 if (list_empty(&sbi->s_es_list)) { 1691 spin_unlock(&sbi->s_es_lock); 1692 goto out; 1693 } 1694 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, 1695 i_es_list); 1696 /* Move the inode to the tail */ 1697 list_move_tail(&ei->i_es_list, &sbi->s_es_list); 1698 1699 /* 1700 * Normally we try hard to avoid shrinking precached inodes, 1701 * but we will as a last resort. 1702 */ 1703 if (!retried && ext4_test_inode_state(&ei->vfs_inode, 1704 EXT4_STATE_EXT_PRECACHED)) { 1705 nr_skipped++; 1706 continue; 1707 } 1708 1709 if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { 1710 nr_skipped++; 1711 continue; 1712 } 1713 /* 1714 * Now we hold i_es_lock which protects us from inode reclaim 1715 * freeing inode under us 1716 */ 1717 spin_unlock(&sbi->s_es_lock); 1718 1719 nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); 1720 write_unlock(&ei->i_es_lock); 1721 1722 if (nr_to_scan <= 0) 1723 goto out; 1724 spin_lock(&sbi->s_es_lock); 1725 } 1726 spin_unlock(&sbi->s_es_lock); 1727 1728 /* 1729 * If we skipped any inodes, and we weren't able to make any 1730 * forward progress, try again to scan precached inodes. 1731 */ 1732 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1733 retried++; 1734 goto retry; 1735 } 1736 1737 if (locked_ei && nr_shrunk == 0) 1738 nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); 1739 1740 out: 1741 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1742 if (likely(es_stats->es_stats_scan_time)) 1743 es_stats->es_stats_scan_time = (scan_time + 1744 es_stats->es_stats_scan_time*3) / 4; 1745 else 1746 es_stats->es_stats_scan_time = scan_time; 1747 if (scan_time > es_stats->es_stats_max_scan_time) 1748 es_stats->es_stats_max_scan_time = scan_time; 1749 if (likely(es_stats->es_stats_shrunk)) 1750 es_stats->es_stats_shrunk = (nr_shrunk + 1751 es_stats->es_stats_shrunk*3) / 4; 1752 else 1753 es_stats->es_stats_shrunk = nr_shrunk; 1754 1755 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, 1756 nr_skipped, retried); 1757 return nr_shrunk; 1758 } 1759 1760 static unsigned long ext4_es_count(struct shrinker *shrink, 1761 struct shrink_control *sc) 1762 { 1763 unsigned long nr; 1764 struct ext4_sb_info *sbi; 1765 1766 sbi = shrink->private_data; 1767 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); 1768 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); 1769 return nr; 1770 } 1771 1772 static unsigned long ext4_es_scan(struct shrinker *shrink, 1773 struct shrink_control *sc) 1774 { 1775 struct ext4_sb_info *sbi = shrink->private_data; 1776 int nr_to_scan = sc->nr_to_scan; 1777 int ret, nr_shrunk; 1778 1779 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); 1780 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1781 1782 nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); 1783 1784 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); 1785 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); 1786 return nr_shrunk; 1787 } 1788 1789 int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) 1790 { 1791 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); 1792 struct ext4_es_stats *es_stats = &sbi->s_es_stats; 1793 struct ext4_inode_info *ei, *max = NULL; 1794 unsigned int inode_cnt = 0; 1795 1796 if (v != SEQ_START_TOKEN) 1797 return 0; 1798 1799 /* here we just find an inode that has the max nr. of objects */ 1800 spin_lock(&sbi->s_es_lock); 1801 list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { 1802 inode_cnt++; 1803 if (max && max->i_es_all_nr < ei->i_es_all_nr) 1804 max = ei; 1805 else if (!max) 1806 max = ei; 1807 } 1808 spin_unlock(&sbi->s_es_lock); 1809 1810 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1811 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1812 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); 1813 seq_printf(seq, " %lld/%lld cache hits/misses\n", 1814 percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), 1815 percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); 1816 if (inode_cnt) 1817 seq_printf(seq, " %d inodes on list\n", inode_cnt); 1818 1819 seq_printf(seq, "average:\n %llu us scan time\n", 1820 div_u64(es_stats->es_stats_scan_time, 1000)); 1821 seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); 1822 if (inode_cnt) 1823 seq_printf(seq, 1824 "maximum:\n %lu inode (%u objects, %u reclaimable)\n" 1825 " %llu us max scan time\n", 1826 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, 1827 div_u64(es_stats->es_stats_max_scan_time, 1000)); 1828 1829 return 0; 1830 } 1831 1832 int ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1833 { 1834 int err; 1835 1836 /* Make sure we have enough bits for physical block number */ 1837 BUILD_BUG_ON(ES_SHIFT < 48); 1838 INIT_LIST_HEAD(&sbi->s_es_list); 1839 sbi->s_es_nr_inode = 0; 1840 spin_lock_init(&sbi->s_es_lock); 1841 sbi->s_es_stats.es_stats_shrunk = 0; 1842 err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, 1843 GFP_KERNEL); 1844 if (err) 1845 return err; 1846 err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, 1847 GFP_KERNEL); 1848 if (err) 1849 goto err1; 1850 sbi->s_es_stats.es_stats_scan_time = 0; 1851 sbi->s_es_stats.es_stats_max_scan_time = 0; 1852 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); 1853 if (err) 1854 goto err2; 1855 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); 1856 if (err) 1857 goto err3; 1858 1859 sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); 1860 if (!sbi->s_es_shrinker) { 1861 err = -ENOMEM; 1862 goto err4; 1863 } 1864 1865 sbi->s_es_shrinker->scan_objects = ext4_es_scan; 1866 sbi->s_es_shrinker->count_objects = ext4_es_count; 1867 sbi->s_es_shrinker->private_data = sbi; 1868 1869 shrinker_register(sbi->s_es_shrinker); 1870 1871 return 0; 1872 err4: 1873 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); 1874 err3: 1875 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1876 err2: 1877 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); 1878 err1: 1879 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); 1880 return err; 1881 } 1882 1883 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) 1884 { 1885 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); 1886 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); 1887 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1888 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); 1889 shrinker_free(sbi->s_es_shrinker); 1890 } 1891 1892 /* 1893 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at 1894 * most *nr_to_scan extents, update *nr_to_scan accordingly. 1895 * 1896 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. 1897 * Increment *nr_shrunk by the number of reclaimed extents. Also update 1898 * ei->i_es_shrink_lblk to where we should continue scanning. 1899 */ 1900 static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, 1901 int *nr_to_scan, int *nr_shrunk) 1902 { 1903 struct inode *inode = &ei->vfs_inode; 1904 struct ext4_es_tree *tree = &ei->i_es_tree; 1905 struct extent_status *es; 1906 struct rb_node *node; 1907 1908 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); 1909 if (!es) 1910 goto out_wrap; 1911 1912 while (*nr_to_scan > 0) { 1913 if (es->es_lblk > end) { 1914 ei->i_es_shrink_lblk = end + 1; 1915 return 0; 1916 } 1917 1918 (*nr_to_scan)--; 1919 node = rb_next(&es->rb_node); 1920 1921 if (ext4_es_must_keep(es)) 1922 goto next; 1923 if (ext4_es_is_referenced(es)) { 1924 ext4_es_clear_referenced(es); 1925 goto next; 1926 } 1927 1928 rb_erase(&es->rb_node, &tree->root); 1929 ext4_es_free_extent(inode, es); 1930 (*nr_shrunk)++; 1931 next: 1932 if (!node) 1933 goto out_wrap; 1934 es = rb_entry(node, struct extent_status, rb_node); 1935 } 1936 ei->i_es_shrink_lblk = es->es_lblk; 1937 return 1; 1938 out_wrap: 1939 ei->i_es_shrink_lblk = 0; 1940 return 0; 1941 } 1942 1943 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) 1944 { 1945 struct inode *inode = &ei->vfs_inode; 1946 int nr_shrunk = 0; 1947 ext4_lblk_t start = ei->i_es_shrink_lblk; 1948 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 1949 DEFAULT_RATELIMIT_BURST); 1950 1951 if (ei->i_es_shk_nr == 0) 1952 return 0; 1953 1954 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && 1955 __ratelimit(&_rs)) 1956 ext4_warning(inode->i_sb, "forced shrink of precached extents"); 1957 1958 if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && 1959 start != 0) 1960 es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); 1961 1962 ei->i_es_tree.cache_es = NULL; 1963 return nr_shrunk; 1964 } 1965 1966 /* 1967 * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove 1968 * discretionary entries from the extent status cache. (Some entries 1969 * must be present for proper operations.) 1970 */ 1971 void ext4_clear_inode_es(struct inode *inode) 1972 { 1973 struct ext4_inode_info *ei = EXT4_I(inode); 1974 struct extent_status *es; 1975 struct ext4_es_tree *tree; 1976 struct rb_node *node; 1977 1978 write_lock(&ei->i_es_lock); 1979 tree = &EXT4_I(inode)->i_es_tree; 1980 tree->cache_es = NULL; 1981 node = rb_first(&tree->root); 1982 while (node) { 1983 es = rb_entry(node, struct extent_status, rb_node); 1984 node = rb_next(node); 1985 if (!ext4_es_must_keep(es)) { 1986 rb_erase(&es->rb_node, &tree->root); 1987 ext4_es_free_extent(inode, es); 1988 } 1989 } 1990 ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); 1991 write_unlock(&ei->i_es_lock); 1992 } 1993 1994 #ifdef ES_DEBUG__ 1995 static void ext4_print_pending_tree(struct inode *inode) 1996 { 1997 struct ext4_pending_tree *tree; 1998 struct rb_node *node; 1999 struct pending_reservation *pr; 2000 2001 printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); 2002 tree = &EXT4_I(inode)->i_pending_tree; 2003 node = rb_first(&tree->root); 2004 while (node) { 2005 pr = rb_entry(node, struct pending_reservation, rb_node); 2006 printk(KERN_DEBUG " %u", pr->lclu); 2007 node = rb_next(node); 2008 } 2009 printk(KERN_DEBUG "\n"); 2010 } 2011 #else 2012 #define ext4_print_pending_tree(inode) 2013 #endif 2014 2015 int __init ext4_init_pending(void) 2016 { 2017 ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); 2018 if (ext4_pending_cachep == NULL) 2019 return -ENOMEM; 2020 return 0; 2021 } 2022 2023 void ext4_exit_pending(void) 2024 { 2025 kmem_cache_destroy(ext4_pending_cachep); 2026 } 2027 2028 void ext4_init_pending_tree(struct ext4_pending_tree *tree) 2029 { 2030 tree->root = RB_ROOT; 2031 } 2032 2033 /* 2034 * __get_pending - retrieve a pointer to a pending reservation 2035 * 2036 * @inode - file containing the pending cluster reservation 2037 * @lclu - logical cluster of interest 2038 * 2039 * Returns a pointer to a pending reservation if it's a member of 2040 * the set, and NULL if not. Must be called holding i_es_lock. 2041 */ 2042 static struct pending_reservation *__get_pending(struct inode *inode, 2043 ext4_lblk_t lclu) 2044 { 2045 struct ext4_pending_tree *tree; 2046 struct rb_node *node; 2047 struct pending_reservation *pr = NULL; 2048 2049 tree = &EXT4_I(inode)->i_pending_tree; 2050 node = (&tree->root)->rb_node; 2051 2052 while (node) { 2053 pr = rb_entry(node, struct pending_reservation, rb_node); 2054 if (lclu < pr->lclu) 2055 node = node->rb_left; 2056 else if (lclu > pr->lclu) 2057 node = node->rb_right; 2058 else if (lclu == pr->lclu) 2059 return pr; 2060 } 2061 return NULL; 2062 } 2063 2064 /* 2065 * __insert_pending - adds a pending cluster reservation to the set of 2066 * pending reservations 2067 * 2068 * @inode - file containing the cluster 2069 * @lblk - logical block in the cluster to be added 2070 * @prealloc - preallocated pending entry 2071 * 2072 * Returns 1 on successful insertion and -ENOMEM on failure. If the 2073 * pending reservation is already in the set, returns successfully. 2074 */ 2075 static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, 2076 struct pending_reservation **prealloc) 2077 { 2078 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2079 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; 2080 struct rb_node **p = &tree->root.rb_node; 2081 struct rb_node *parent = NULL; 2082 struct pending_reservation *pr; 2083 ext4_lblk_t lclu; 2084 int ret = 0; 2085 2086 lclu = EXT4_B2C(sbi, lblk); 2087 /* search to find parent for insertion */ 2088 while (*p) { 2089 parent = *p; 2090 pr = rb_entry(parent, struct pending_reservation, rb_node); 2091 2092 if (lclu < pr->lclu) { 2093 p = &(*p)->rb_left; 2094 } else if (lclu > pr->lclu) { 2095 p = &(*p)->rb_right; 2096 } else { 2097 /* pending reservation already inserted */ 2098 goto out; 2099 } 2100 } 2101 2102 if (likely(*prealloc == NULL)) { 2103 pr = __alloc_pending(false); 2104 if (!pr) { 2105 ret = -ENOMEM; 2106 goto out; 2107 } 2108 } else { 2109 pr = *prealloc; 2110 *prealloc = NULL; 2111 } 2112 pr->lclu = lclu; 2113 2114 rb_link_node(&pr->rb_node, parent, p); 2115 rb_insert_color(&pr->rb_node, &tree->root); 2116 ret = 1; 2117 2118 out: 2119 return ret; 2120 } 2121 2122 /* 2123 * __remove_pending - removes a pending cluster reservation from the set 2124 * of pending reservations 2125 * 2126 * @inode - file containing the cluster 2127 * @lblk - logical block in the pending cluster reservation to be removed 2128 * 2129 * Returns successfully if pending reservation is not a member of the set. 2130 */ 2131 static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) 2132 { 2133 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2134 struct pending_reservation *pr; 2135 struct ext4_pending_tree *tree; 2136 2137 pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); 2138 if (pr != NULL) { 2139 tree = &EXT4_I(inode)->i_pending_tree; 2140 rb_erase(&pr->rb_node, &tree->root); 2141 __free_pending(pr); 2142 } 2143 } 2144 2145 /* 2146 * ext4_remove_pending - removes a pending cluster reservation from the set 2147 * of pending reservations 2148 * 2149 * @inode - file containing the cluster 2150 * @lblk - logical block in the pending cluster reservation to be removed 2151 * 2152 * Locking for external use of __remove_pending. 2153 */ 2154 void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) 2155 { 2156 struct ext4_inode_info *ei = EXT4_I(inode); 2157 2158 write_lock(&ei->i_es_lock); 2159 __remove_pending(inode, lblk); 2160 write_unlock(&ei->i_es_lock); 2161 } 2162 2163 /* 2164 * ext4_is_pending - determine whether a cluster has a pending reservation 2165 * on it 2166 * 2167 * @inode - file containing the cluster 2168 * @lblk - logical block in the cluster 2169 * 2170 * Returns true if there's a pending reservation for the cluster in the 2171 * set of pending reservations, and false if not. 2172 */ 2173 bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) 2174 { 2175 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2176 struct ext4_inode_info *ei = EXT4_I(inode); 2177 bool ret; 2178 2179 read_lock(&ei->i_es_lock); 2180 ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); 2181 read_unlock(&ei->i_es_lock); 2182 2183 return ret; 2184 } 2185 2186 /* 2187 * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents 2188 * status tree, adding a pending reservation 2189 * where needed 2190 * 2191 * @inode - file containing the newly added block 2192 * @lblk - start logical block to be added 2193 * @len - length of blocks to be added 2194 * @lclu_allocated/end_allocated - indicates whether a physical cluster has 2195 * been allocated for the logical cluster 2196 * that contains the start/end block. Note that 2197 * end_allocated should always be set to false 2198 * if the start and the end block are in the 2199 * same cluster 2200 */ 2201 void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, 2202 ext4_lblk_t len, bool lclu_allocated, 2203 bool end_allocated) 2204 { 2205 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2206 struct extent_status newes; 2207 ext4_lblk_t end = lblk + len - 1; 2208 int err1 = 0, err2 = 0, err3 = 0; 2209 struct extent_status *es1 = NULL; 2210 struct extent_status *es2 = NULL; 2211 struct pending_reservation *pr1 = NULL; 2212 struct pending_reservation *pr2 = NULL; 2213 2214 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 2215 return; 2216 2217 es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", 2218 lblk, len, inode->i_ino); 2219 if (!len) 2220 return; 2221 2222 WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && 2223 end_allocated); 2224 2225 newes.es_lblk = lblk; 2226 newes.es_len = len; 2227 ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); 2228 2229 ext4_es_insert_extent_check(inode, &newes); 2230 2231 retry: 2232 if (err1 && !es1) 2233 es1 = __es_alloc_extent(true); 2234 if ((err1 || err2) && !es2) 2235 es2 = __es_alloc_extent(true); 2236 if (err1 || err2 || err3 < 0) { 2237 if (lclu_allocated && !pr1) 2238 pr1 = __alloc_pending(true); 2239 if (end_allocated && !pr2) 2240 pr2 = __alloc_pending(true); 2241 } 2242 write_lock(&EXT4_I(inode)->i_es_lock); 2243 2244 err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1); 2245 if (err1 != 0) 2246 goto error; 2247 /* Free preallocated extent if it didn't get used. */ 2248 if (es1) { 2249 if (!es1->es_len) 2250 __es_free_extent(es1); 2251 es1 = NULL; 2252 } 2253 2254 err2 = __es_insert_extent(inode, &newes, es2); 2255 if (err2 != 0) 2256 goto error; 2257 /* Free preallocated extent if it didn't get used. */ 2258 if (es2) { 2259 if (!es2->es_len) 2260 __es_free_extent(es2); 2261 es2 = NULL; 2262 } 2263 2264 if (lclu_allocated) { 2265 err3 = __insert_pending(inode, lblk, &pr1); 2266 if (err3 < 0) 2267 goto error; 2268 if (pr1) { 2269 __free_pending(pr1); 2270 pr1 = NULL; 2271 } 2272 } 2273 if (end_allocated) { 2274 err3 = __insert_pending(inode, end, &pr2); 2275 if (err3 < 0) 2276 goto error; 2277 if (pr2) { 2278 __free_pending(pr2); 2279 pr2 = NULL; 2280 } 2281 } 2282 ext4_es_inc_seq(inode); 2283 error: 2284 write_unlock(&EXT4_I(inode)->i_es_lock); 2285 if (err1 || err2 || err3 < 0) 2286 goto retry; 2287 2288 trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, 2289 end_allocated); 2290 ext4_es_print_tree(inode); 2291 ext4_print_pending_tree(inode); 2292 return; 2293 } 2294 2295 /* 2296 * __revise_pending - makes, cancels, or leaves unchanged pending cluster 2297 * reservations for a specified block range depending 2298 * upon the presence or absence of delayed blocks 2299 * outside the range within clusters at the ends of the 2300 * range 2301 * 2302 * @inode - file containing the range 2303 * @lblk - logical block defining the start of range 2304 * @len - length of range in blocks 2305 * @prealloc - preallocated pending entry 2306 * 2307 * Used after a newly allocated extent is added to the extents status tree. 2308 * Requires that the extents in the range have either written or unwritten 2309 * status. Must be called while holding i_es_lock. Returns number of new 2310 * inserts pending cluster on insert pendings, returns 0 on remove pendings, 2311 * return -ENOMEM on failure. 2312 */ 2313 static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, 2314 ext4_lblk_t len, 2315 struct pending_reservation **prealloc) 2316 { 2317 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2318 ext4_lblk_t end = lblk + len - 1; 2319 ext4_lblk_t first, last; 2320 bool f_del = false, l_del = false; 2321 int pendings = 0; 2322 int ret = 0; 2323 2324 if (len == 0) 2325 return 0; 2326 2327 /* 2328 * Two cases - block range within single cluster and block range 2329 * spanning two or more clusters. Note that a cluster belonging 2330 * to a range starting and/or ending on a cluster boundary is treated 2331 * as if it does not contain a delayed extent. The new range may 2332 * have allocated space for previously delayed blocks out to the 2333 * cluster boundary, requiring that any pre-existing pending 2334 * reservation be canceled. Because this code only looks at blocks 2335 * outside the range, it should revise pending reservations 2336 * correctly even if the extent represented by the range can't be 2337 * inserted in the extents status tree due to ENOSPC. 2338 */ 2339 2340 if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { 2341 first = EXT4_LBLK_CMASK(sbi, lblk); 2342 if (first != lblk) 2343 f_del = __es_scan_range(inode, &ext4_es_is_delayed, 2344 first, lblk - 1); 2345 if (f_del) { 2346 ret = __insert_pending(inode, first, prealloc); 2347 if (ret < 0) 2348 goto out; 2349 pendings += ret; 2350 } else { 2351 last = EXT4_LBLK_CMASK(sbi, end) + 2352 sbi->s_cluster_ratio - 1; 2353 if (last != end) 2354 l_del = __es_scan_range(inode, 2355 &ext4_es_is_delayed, 2356 end + 1, last); 2357 if (l_del) { 2358 ret = __insert_pending(inode, last, prealloc); 2359 if (ret < 0) 2360 goto out; 2361 pendings += ret; 2362 } else 2363 __remove_pending(inode, last); 2364 } 2365 } else { 2366 first = EXT4_LBLK_CMASK(sbi, lblk); 2367 if (first != lblk) 2368 f_del = __es_scan_range(inode, &ext4_es_is_delayed, 2369 first, lblk - 1); 2370 if (f_del) { 2371 ret = __insert_pending(inode, first, prealloc); 2372 if (ret < 0) 2373 goto out; 2374 pendings += ret; 2375 } else 2376 __remove_pending(inode, first); 2377 2378 last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; 2379 if (last != end) 2380 l_del = __es_scan_range(inode, &ext4_es_is_delayed, 2381 end + 1, last); 2382 if (l_del) { 2383 ret = __insert_pending(inode, last, prealloc); 2384 if (ret < 0) 2385 goto out; 2386 pendings += ret; 2387 } else 2388 __remove_pending(inode, last); 2389 } 2390 out: 2391 return (ret < 0) ? ret : pendings; 2392 } 2393