/*
 * fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/rbtree.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussion in the Ext4 Developer Workshop, we
 * will introduce a new structure called the io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reservation space warning), and to provide extent-level locking.
 * The delay extent tree is the first step towards this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the
 * delay extent tree, and its only goal was to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still
 * called the delay extent tree in the comments below.  But to better
 * reflect what it does, it has been renamed to the extent status tree.
 *
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  The tree is updated when a delayed allocation
 * is issued, and when a delayed extent is written out or invalidated.
 * Therefore the implementations of fiemap and bigalloc are simplified,
 * and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 */

/*
 * extents status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status encompasses delayed extents and extent locks
 *
 * 1. Why a delayed extent implementation?
 *
 * Without delayed extents, ext4 identifies a delayed extent by looking
 * up the page cache, which leads to complicated, buggy, and inefficient
 * code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need to
 * know whether a block or a range of blocks belongs to a delayed
 * extent.
 *
 * Let us have a look at how they work without a delayed extent
 * implementation.
 * --	FIEMAP
 *	FIEMAP looks up the page cache to distinguish delayed
 *	allocations from holes.
 *
 * --	SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 * --	bigalloc
 *	bigalloc looks up the page cache to figure out whether a block
 *	is already under delayed allocation, in order to determine
 *	whether quota reservation is needed for the cluster.
 *
 * --	punch hole
 *	punch hole looks up the page cache to identify a delayed extent.
 *
 * --	writeout
 *	Writeout looks up the whole page cache to see if a buffer is
 *	mapped.  If there are not very many delayed buffers, this is
 *	time consuming.
 *
 * With a delayed extent implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out whether a block or a range of
 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
 * by searching the delayed extent tree.
 *
 *
 * ==========================================================================
 * 2. ext4 delayed extent implementation
 *
 * --	delayed extent
 *	A delayed extent is a range of blocks which are contiguous
 *	logically and under delayed allocation.  Unlike an extent in
 *	ext4, a delayed extent is an in-memory structure; there is no
 *	corresponding on-disk data.  There is no limit on the length of
 *	a delayed extent, so a delayed extent can contain as many blocks
 *	as are logically contiguous.
 *
 * --	delayed extent tree
 *	Every inode has a delayed extent tree, and all blocks under
 *	delayed allocation are added to the tree as delayed extents.
 *	Delayed extents in the tree are ordered by logical block number.
 *
 * --	operations on a delayed extent tree
 *	There are three operations on a delayed extent tree: finding the
 *	next delayed extent, adding a space (a range of blocks) and
 *	removing a space.  See the usage sketch after this comment.
 *
 * --	race on a delayed extent tree
 *	The delayed extent tree is protected by inode->i_es_lock.
 *
 *
 * ==========================================================================
 * 3. performance analysis
 * --	overhead
 *	1. There is a cached extent for write access, so if writes are
 *	not very random, adding space operations run in O(1) time.
 *
 * --	gain
 *	1. The code is much simpler, more readable, more maintainable
 *	and more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 * --	Track all extent status
 *
 * --	Improve the get block process
 *
 * --	Extent-level locking
 */
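
/*
 * Usage sketch for the three operations above.  This is illustrative
 * only: the loop below is modeled on how SEEK_DATA-style code could
 * walk all delayed extents of an inode, and the variables lblk and len
 * are hypothetical.
 *
 *	struct extent_status es;
 *	ext4_lblk_t next;
 *
 *	ext4_es_insert_extent(inode, lblk, len);	// add a space
 *
 *	es.start = 0;
 *	next = ext4_es_find_extent(inode, &es);		// find 1st extent
 *	while (es.len > 0) {
 *		// es.start/es.len describe one delayed extent
 *		if (next == EXT_MAX_BLOCKS)
 *			break;				// that was the last one
 *		es.start = next;
 *		next = ext4_es_find_extent(inode, &es);
 *	}
 *
 *	ext4_es_remove_extent(inode, lblk, len);	// remove the space
 */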

static struct kmem_cache *ext4_es_cachep;

int __init ext4_init_es(void)
{
	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		/* KERN_CONT so the extents stay on one log line */
		printk(KERN_CONT " [%u/%u)", es->start, es->len);
		node = rb_next(node);
	}
	printk(KERN_CONT "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t extent_status_end(struct extent_status *es)
{
	BUG_ON(es->start + es->len < es->start);
	return es->start + es->len - 1;
}

/*
 * Search through the tree for a delayed extent containing a given
 * offset.  If it can't be found, try to find the next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t offset)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (offset < es->start)
			node = node->rb_left;
		else if (offset > extent_status_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && offset < es->start)
		return es;

	if (es && offset > extent_status_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}
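
/*
 * Worked example (illustrative only, the values are hypothetical): for
 * a tree holding the delayed extents [2/3) and [10/5), i.e. blocks 2-4
 * and blocks 10-14,
 *
 *	__es_tree_search(root, 3)  returns [2/3)  (3 lies inside it)
 *	__es_tree_search(root, 6)  returns [10/5) (next extent after 6)
 *	__es_tree_search(root, 16) returns NULL   (nothing at or after 16)
 */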

/*
 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
 * if it exists, otherwise the next extent after @es->start.
 *
 * @inode: the inode which owns delayed extents
 * @es: delayed extent that we found
 *
 * Returns the first block of the extent that follows the one returned
 * in @es, or EXT_MAX_BLOCKS if there is no further delayed extent.
 * The found delayed extent is returned via @es; if none is found,
 * @es->len is set to zero.
 */
ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	ext4_lblk_t ret = EXT_MAX_BLOCKS;

	trace_ext4_es_find_extent_enter(inode, es->start);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* check the cached extent first */
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(es->start, es1->start, es1->len)) {
			es_debug("%u cached by [%u/%u)\n",
				 es->start, es1->start, es1->len);
			goto out;
		}
	}

	es->len = 0;
	es1 = __es_tree_search(&tree->root, es->start);

out:
	if (es1) {
		tree->cache_es = es1;
		es->start = es1->start;
		es->len = es1->len;
		node = rb_next(&es1->rb_node);
		if (node) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			ret = es1->start;
		}
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_extent_exit(inode, es, ret);
	return ret;
}

static struct extent_status *
ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
{
	struct extent_status *es;

	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->start = start;
	es->len = len;
	return es;
}

static void ext4_es_free_extent(struct extent_status *es)
{
	kmem_cache_free(ext4_es_cachep, es);
}

static struct extent_status *
ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
{
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (es->start == extent_status_end(es1) + 1) {
		es1->len += es->len;
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
{
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (es1->start == extent_status_end(es) + 1) {
		es->len += es1->len;
		rb_erase(node, &tree->root);
		ext4_es_free_extent(es1);
	}

	return es;
}

static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
			      ext4_lblk_t len)
{
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;
	ext4_lblk_t end = offset + len - 1;

	BUG_ON(end < offset);

	/* try the fast paths against the cached extent first */
	es = tree->cache_es;
	if (es && offset == (extent_status_end(es) + 1)) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		es->len += len;
		es = ext4_es_try_to_merge_right(tree, es);
		goto out;
	} else if (es && es->start == end + 1) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		es->start = offset;
		es->len += len;
		es = ext4_es_try_to_merge_left(tree, es);
		goto out;
	} else if (es && es->start <= offset &&
		   end <= extent_status_end(es)) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		goto out;
	}

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (offset < es->start) {
			if (es->start == end + 1) {
				es->start = offset;
				es->len += len;
				es = ext4_es_try_to_merge_left(tree, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (offset > extent_status_end(es)) {
			if (offset == extent_status_end(es) + 1) {
				es->len += len;
				es = ext4_es_try_to_merge_right(tree, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			/* the new space overlaps es; grow es if the new
			 * space extends past its end */
			if (extent_status_end(es) <= end)
				es->len = offset - es->start + len;
			goto out;
		}
	}

	es = ext4_es_alloc_extent(offset, len);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}
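
/*
 * Illustrative sketch of the fast paths above (hypothetical values):
 * with tree->cache_es == [10/5), i.e. blocks 10-14,
 *
 *	__es_insert_extent(tree, 15, 3)	extends it right to [10/8)
 *	__es_insert_extent(tree, 7, 3)	extends it left to [7/8)
 *	__es_insert_extent(tree, 11, 2)	is already covered, a no-op
 *
 * After either extension, ext4_es_try_to_merge_right()/_left() glue the
 * grown extent to its neighbour if the two now touch.
 */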

/*
 * ext4_es_insert_extent() adds a space to a delayed extent tree.
 * It takes inode->i_es_lock itself, so the caller must not already
 * hold it.
 *
 * ext4_es_insert_extent is called by ext4_da_write_begin and
 * ext4_es_remove_extent.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
			  ext4_lblk_t len)
{
	struct ext4_es_tree *tree;
	int err = 0;

	trace_ext4_es_insert_extent(inode, offset, len);
	es_debug("add [%u/%u) to extent status tree of inode %lu\n",
		 offset, len, inode->i_ino);

	write_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	err = __es_insert_extent(tree, offset, len);
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}
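
/*
 * Hypothetical caller sketch (illustrative only): a delayed-allocation
 * write path that has just set up a delayed buffer for logical block
 * lblk could record it with
 *
 *	err = ext4_es_insert_extent(inode, lblk, 1);
 *	if (err < 0)
 *		goto out_err;	// -ENOMEM if tree node allocation failed
 *
 * Contiguous single-block insertions like this are cheap: they hit the
 * cached-extent fast path in __es_insert_extent() and merely extend it.
 */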

/*
 * ext4_es_remove_extent() removes a space from a delayed extent tree.
 * It takes inode->i_es_lock itself, so the caller must not already
 * hold it.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
			  ext4_lblk_t len)
{
	struct rb_node *node;
	struct ext4_es_tree *tree;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2, end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, offset, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 offset, len, inode->i_ino);

	end = offset + len - 1;
	BUG_ON(end < offset);
	write_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	es = __es_tree_search(&tree->root, offset);
	if (!es)
		goto out;
	if (es->start > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.start = es->start;
	orig_es.len = es->len;
	len1 = offset > es->start ? offset - es->start : 0;
	len2 = extent_status_end(es) > end ?
	       extent_status_end(es) - end : 0;
	if (len1 > 0)
		es->len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			err = __es_insert_extent(tree, end + 1, len2);
			if (err) {
				es->start = orig_es.start;
				es->len = orig_es.len;
				goto out;
			}
		} else {
			es->start = end + 1;
			es->len = len2;
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && extent_status_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->start < end + 1) {
		len1 = extent_status_end(es) - end;
		es->start = end + 1;
		es->len = len1;
	}

out:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}
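
/*
 * Worked example of the split logic above (hypothetical values): for a
 * tree holding the single delayed extent [0/12), i.e. blocks 0-11,
 *
 *	ext4_es_remove_extent(inode, 3, 5)
 *
 * punches out blocks 3-7: len1 = 3 (the piece below the hole) and
 * len2 = 4 (the piece above it), so the extent is trimmed to [0/3) and
 * a new extent [8/4) is inserted for the remainder.
 */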