/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 */


/*
 * mballoc.c contains the multiblocks allocation routines
 */

#include "mballoc.h"
#include <trace/events/ext4.h>

/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * The allocation request involves a request for multiple blocks near to
 * the goal (block) value specified.
 *
 * During the initialization phase of the allocator we decide to use
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever is
 * larger. If the size is less than sbi->s_mb_stream_request we select
 * group preallocation. The default value of s_mb_stream_request is
 * 16 blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is
 * to ensure that small files are placed closer together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 * spaces for this particular inode. An inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length of this prealloc space
 * pa_free   -> free space available in this prealloc space
 *
 * The inode preallocation space is selected by looking at the _logical_
 * start block. Only if the logical file block falls within the range of
 * a prealloc space do we consume that particular prealloc space. This
 * makes sure that we have contiguous physical blocks representing the
 * file blocks.
 *
 * The important thing to note about inode prealloc space is that we
 * don't modify any of its values except pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if
 * we have the group allocation flag set, then we look at the locality
 * group prealloc space. These are per-CPU prealloc lists represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-CPU locality group is to reduce the
 * contention between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used by checking whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc or/and locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offsets get
 * mapped to the buddy and bitmap information of the different groups.
 * The buddy information is attached to the buddy cache inode so that we
 * can access it through the page cache. The information for each group
 * is loaded via ext4_mb_load_buddy and consists of the block bitmap and
 * the buddy information, stored in the inode as:
 *
 *  {                        page                        }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information. So for each group we
 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
 * blocksize) blocks, so it can hold information on groups_per_page
 * groups, which is blocks_per_page/2.
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available.
 *
 * Before allocating blocks via the buddy cache we normalize the request.
 * This ensures we ask for more blocks than we need. The extra blocks
 * that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented
 * in terms of number of blocks. If we have mounted the file system with
 * the -o stripe=<value> option the group prealloc request is normalized
 * to the stripe value (sbi->s_stripe).
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 *
 * The regular allocator uses the buddy scan only if the request len is a
 * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
 * The value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks
 * in stripe size chunks. This should result in better allocation on RAID
 * setups. If not, we search in the specific group using the bitmap for
 * best extents. The tunables min_to_scan and max_to_scan control the
 * behaviour here. min_to_scan indicates how long mballoc __must__ look
 * for a best extent and max_to_scan indicates how long mballoc __can__
 * look for a best extent among the found extents. Searching for the
 * blocks starts with the group specified as the goal value in the
 * allocation context via ac_g_ex. Each group is first checked based on
 * the criteria whether it can be used for allocation. ext4_mb_good_group
 * explains how the groups are checked.
 *
 * Both prealloc spaces are populated as described above. So for the
 * first request we will hit the buddy cache, which will result in this
 * prealloc space getting filled. The prealloc space is then later used
 * for subsequent requests.
 */
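/*
 * Illustrative sketch (not used by the allocator): the layout described
 * above maps group N's block bitmap to block 2*N of the buddy cache
 * inode and its buddy to block 2*N + 1, so the page number and the
 * offset within that page follow from simple division, mirroring the
 * arithmetic in ext4_mb_load_buddy(). The helper below is a standalone
 * example with a hypothetical name. With 4k pages and a 1k block size,
 * blocks_per_page is 4 and groups_per_page is 2, so page 0 holds the
 * bitmap and buddy of group 0 followed by those of group 1.
 */
static inline void mb_example_cache_position(int group, int blocks_per_page,
                                             int *pnum, int *poff)
{
        int block = group * 2;          /* bitmap block; buddy is block + 1 */

        *pnum = block / blocks_per_page;        /* page index in the cache */
        *poff = block % blocks_per_page;        /* block offset in that page */
}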

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks the number of blocks
 *    left unused. so, before taking some block from the descriptor,
 *    one must make sure the corresponding logical block isn't
 *    allocated yet. this also means that freeing any block within the
 *    descriptor's range must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to
 *    a permanent set of inodes: an inode can join and leave the group.
 *    space from this type of preallocation can be used for any inode.
 *    thus it's consumed from the beginning to the end.
 *
 * the relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count number of
 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:                 buddy = on-disk + PAs
 *  - new PA:                     buddy += N; PA = N
 *  - use inode PA:               on-disk += N; PA -= N
 *  - discard inode PA:           buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:      on-disk += N; PA -= N
 *  - discard locality group PA:  buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is used in the real operation because we can't know the actual
 *        used bits from the PA, only from the on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 *  1) if the buddy is referenced, it's already initialized
 *  2) while a block is used in the buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 *     a bit set and a PA claims the same block, it's OK. IOW, one can set
 *     a bit in the on-disk bitmap if the buddy has the same bit set or/and
 *     a PA covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for the PA are allocated in the buddy, the buddy must be
 *      referenced until the PA is linked to the allocation group to
 *      avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either the on-disk bitmap or the PA has
 *      uptodate data. given (3) we care that the PA-=N operation doesn't
 *      interfere with init
 *    - discard inode PA
 *      the simplest way would be to have the buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have the buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      the discard process must wait until the PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      the discard process must wait until the PA isn't used by another process
 *  - use inode PA vs.
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      the discard process must wait until the PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      the discard process must wait until the PA isn't used by another process
 *
 * now we're ready to draw a few consequences:
 *  - a PA is referenced and while it is, no discard is possible
 *  - a PA is referenced until the block is marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness: there is no need
 * to modify the buddy in this case, but we should care about concurrent
 * init
 *
 */
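/*
 * Illustrative sketch (not part of mballoc): a toy model of the
 * accounting identities above, using plain counters instead of bitmaps.
 * The names (mb_example_*) are hypothetical. After the "new PA" and
 * "use PA" steps the relation buddy_used == on_disk_used + pa_free keeps
 * holding, which is exactly "in-core buddy = on-disk bitmap + PAs".
 * E.g. starting from all zeroes, new PA of 8 gives buddy=8, PA=8;
 * using 3 of them gives on-disk=3, PA=5, and 8 == 3 + 5 still holds.
 */
struct mb_example_counts {
        int on_disk_used;       /* blocks marked used in the on-disk bitmap */
        int buddy_used;         /* blocks marked used in the in-core buddy */
        int pa_free;            /* still-unused blocks reserved by the PA */
};

static inline int mb_example_consistent(const struct mb_example_counts *c)
{
        return c->buddy_used == c->on_disk_used + c->pa_free;
}

static inline void mb_example_new_pa(struct mb_example_counts *c, int n)
{
        c->buddy_used += n;     /* new PA: buddy += N */
        c->pa_free = n;         /* ...     PA = N */
}

static inline void mb_example_use_pa(struct mb_example_counts *c, int n)
{
        c->on_disk_used += n;   /* use PA: on-disk += N */
        c->pa_free -= n;        /* ...     PA -= N */
}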

/*
 * Logic in few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group        (group)
 *  - object (inode/locality)   (object)
 *  - per-pa lock               (pa)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *    pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *    pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *    pa
 *    group
 *    object
 *
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
        *bit += ((unsigned long) addr & 7UL) << 3;
        addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
        *bit += ((unsigned long) addr & 3UL) << 3;
        addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
355 #endif 356 return addr; 357 } 358 359 static inline int mb_test_bit(int bit, void *addr) 360 { 361 /* 362 * ext4_test_bit on architecture like powerpc 363 * needs unsigned long aligned address 364 */ 365 addr = mb_correct_addr_and_bit(&bit, addr); 366 return ext4_test_bit(bit, addr); 367 } 368 369 static inline void mb_set_bit(int bit, void *addr) 370 { 371 addr = mb_correct_addr_and_bit(&bit, addr); 372 ext4_set_bit(bit, addr); 373 } 374 375 static inline void mb_clear_bit(int bit, void *addr) 376 { 377 addr = mb_correct_addr_and_bit(&bit, addr); 378 ext4_clear_bit(bit, addr); 379 } 380 381 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 382 { 383 int fix = 0, ret, tmpmax; 384 addr = mb_correct_addr_and_bit(&fix, addr); 385 tmpmax = max + fix; 386 start += fix; 387 388 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 389 if (ret > max) 390 return max; 391 return ret; 392 } 393 394 static inline int mb_find_next_bit(void *addr, int max, int start) 395 { 396 int fix = 0, ret, tmpmax; 397 addr = mb_correct_addr_and_bit(&fix, addr); 398 tmpmax = max + fix; 399 start += fix; 400 401 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 402 if (ret > max) 403 return max; 404 return ret; 405 } 406 407 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 408 { 409 char *bb; 410 411 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 412 BUG_ON(max == NULL); 413 414 if (order > e4b->bd_blkbits + 1) { 415 *max = 0; 416 return NULL; 417 } 418 419 /* at order 0 we see each particular block */ 420 *max = 1 << (e4b->bd_blkbits + 3); 421 if (order == 0) 422 return EXT4_MB_BITMAP(e4b); 423 424 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 425 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 426 427 return bb; 428 } 429 430 #ifdef DOUBLE_CHECK 431 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 432 int first, int count) 433 { 434 int i; 435 struct super_block *sb = e4b->bd_sb; 436 437 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 438 return; 439 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 440 for (i = 0; i < count; i++) { 441 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 442 ext4_fsblk_t blocknr; 443 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 444 blocknr += first + i; 445 blocknr += 446 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 447 ext4_grp_locked_error(sb, e4b->bd_group, 448 __func__, "double-free of inode" 449 " %lu's block %llu(bit %u in group %u)", 450 inode ? 
inode->i_ino : 0, blocknr, 451 first + i, e4b->bd_group); 452 } 453 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 454 } 455 } 456 457 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 458 { 459 int i; 460 461 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 462 return; 463 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 464 for (i = 0; i < count; i++) { 465 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 466 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 467 } 468 } 469 470 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 471 { 472 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 473 unsigned char *b1, *b2; 474 int i; 475 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 476 b2 = (unsigned char *) bitmap; 477 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 478 if (b1[i] != b2[i]) { 479 printk(KERN_ERR "corruption in group %u " 480 "at byte %u(%u): %x in copy != %x " 481 "on disk/prealloc\n", 482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 483 BUG(); 484 } 485 } 486 } 487 } 488 489 #else 490 static inline void mb_free_blocks_double(struct inode *inode, 491 struct ext4_buddy *e4b, int first, int count) 492 { 493 return; 494 } 495 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 496 int first, int count) 497 { 498 return; 499 } 500 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 501 { 502 return; 503 } 504 #endif 505 506 #ifdef AGGRESSIVE_CHECK 507 508 #define MB_CHECK_ASSERT(assert) \ 509 do { \ 510 if (!(assert)) { \ 511 printk(KERN_EMERG \ 512 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 513 function, file, line, # assert); \ 514 BUG(); \ 515 } \ 516 } while (0) 517 518 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 519 const char *function, int line) 520 { 521 struct super_block *sb = e4b->bd_sb; 522 int order = e4b->bd_blkbits + 1; 523 int max; 524 int max2; 525 int i; 526 int j; 527 int k; 528 int count; 529 struct ext4_group_info *grp; 530 int fragments = 0; 531 int fstart; 532 struct list_head *cur; 533 void *buddy; 534 void *buddy2; 535 536 { 537 static int mb_check_counter; 538 if (mb_check_counter++ % 100 != 0) 539 return 0; 540 } 541 542 while (order > 1) { 543 buddy = mb_find_buddy(e4b, order, &max); 544 MB_CHECK_ASSERT(buddy); 545 buddy2 = mb_find_buddy(e4b, order - 1, &max2); 546 MB_CHECK_ASSERT(buddy2); 547 MB_CHECK_ASSERT(buddy != buddy2); 548 MB_CHECK_ASSERT(max * 2 == max2); 549 550 count = 0; 551 for (i = 0; i < max; i++) { 552 553 if (mb_test_bit(i, buddy)) { 554 /* only single bit in buddy2 may be 1 */ 555 if (!mb_test_bit(i << 1, buddy2)) { 556 MB_CHECK_ASSERT( 557 mb_test_bit((i<<1)+1, buddy2)); 558 } else if (!mb_test_bit((i << 1) + 1, buddy2)) { 559 MB_CHECK_ASSERT( 560 mb_test_bit(i << 1, buddy2)); 561 } 562 continue; 563 } 564 565 /* both bits in buddy2 must be 0 */ 566 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 567 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 568 569 for (j = 0; j < (1 << order); j++) { 570 k = (i * (1 << order)) + j; 571 MB_CHECK_ASSERT( 572 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 573 } 574 count++; 575 } 576 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 577 order--; 578 } 579 580 fstart = -1; 581 buddy = mb_find_buddy(e4b, 0, &max); 582 for (i = 0; i < max; i++) { 583 if (!mb_test_bit(i, buddy)) { 584 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 585 if (fstart == -1) { 586 fragments++; 587 fstart = i; 588 } 589 continue; 590 } 591 fstart = -1; 592 /* check used bits 
only */ 593 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 594 buddy2 = mb_find_buddy(e4b, j, &max2); 595 k = i >> j; 596 MB_CHECK_ASSERT(k < max2); 597 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 598 } 599 } 600 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 601 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 602 603 grp = ext4_get_group_info(sb, e4b->bd_group); 604 buddy = mb_find_buddy(e4b, 0, &max); 605 list_for_each(cur, &grp->bb_prealloc_list) { 606 ext4_group_t groupnr; 607 struct ext4_prealloc_space *pa; 608 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 609 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 610 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 611 for (i = 0; i < pa->pa_len; i++) 612 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 613 } 614 return 0; 615 } 616 #undef MB_CHECK_ASSERT 617 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 618 __FILE__, __func__, __LINE__) 619 #else 620 #define mb_check_buddy(e4b) 621 #endif 622 623 /* FIXME!! need more doc */ 624 static void ext4_mb_mark_free_simple(struct super_block *sb, 625 void *buddy, unsigned first, int len, 626 struct ext4_group_info *grp) 627 { 628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 unsigned short min; 630 unsigned short max; 631 unsigned short chunk; 632 unsigned short border; 633 634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 636 border = 2 << sb->s_blocksize_bits; 637 638 while (len > 0) { 639 /* find how many blocks can be covered since this position */ 640 max = ffs(first | border) - 1; 641 642 /* find how many blocks of power 2 we need to mark */ 643 min = fls(len) - 1; 644 645 if (max < min) 646 min = max; 647 chunk = 1 << min; 648 649 /* mark multiblock chunks only */ 650 grp->bb_counters[min]++; 651 if (min > 0) 652 mb_clear_bit(first >> min, 653 buddy + sbi->s_mb_offsets[min]); 654 655 len -= chunk; 656 first += chunk; 657 } 658 } 659 660 static noinline_for_stack 661 void ext4_mb_generate_buddy(struct super_block *sb, 662 void *buddy, void *bitmap, ext4_group_t group) 663 { 664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 unsigned short i = 0; 667 unsigned short first; 668 unsigned short len; 669 unsigned free = 0; 670 unsigned fragments = 0; 671 unsigned long long period = get_cycles(); 672 673 /* initialize buddy from bitmap which is aggregation 674 * of on-disk bitmap and preallocations */ 675 i = mb_find_next_zero_bit(bitmap, max, 0); 676 grp->bb_first_free = i; 677 while (i < max) { 678 fragments++; 679 first = i; 680 i = mb_find_next_bit(bitmap, max, i); 681 len = i - first; 682 free += len; 683 if (len > 1) 684 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 685 else 686 grp->bb_counters[0]++; 687 if (i < max) 688 i = mb_find_next_zero_bit(bitmap, max, i); 689 } 690 grp->bb_fragments = fragments; 691 692 if (free != grp->bb_free) { 693 ext4_grp_locked_error(sb, group, __func__, 694 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", 695 group, free, grp->bb_free); 696 /* 697 * If we intent to continue, we consider group descritor 698 * corrupt and update bb_free using bitmap value 699 */ 700 grp->bb_free = free; 701 } 702 703 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 704 705 period = get_cycles() - period; 706 spin_lock(&EXT4_SB(sb)->s_bal_lock); 707 EXT4_SB(sb)->s_mb_buddies_generated++; 708 EXT4_SB(sb)->s_mb_generation_time += period; 709 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 710 } 711 712 /* The buddy information is attached the buddy cache inode 
713 * for convenience. The information regarding each group 714 * is loaded via ext4_mb_load_buddy. The information involve 715 * block bitmap and buddy information. The information are 716 * stored in the inode as 717 * 718 * { page } 719 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 720 * 721 * 722 * one block each for bitmap and buddy information. 723 * So for each group we take up 2 blocks. A page can 724 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 725 * So it can have information regarding groups_per_page which 726 * is blocks_per_page/2 727 */ 728 729 static int ext4_mb_init_cache(struct page *page, char *incore) 730 { 731 ext4_group_t ngroups; 732 int blocksize; 733 int blocks_per_page; 734 int groups_per_page; 735 int err = 0; 736 int i; 737 ext4_group_t first_group; 738 int first_block; 739 struct super_block *sb; 740 struct buffer_head *bhs; 741 struct buffer_head **bh; 742 struct inode *inode; 743 char *data; 744 char *bitmap; 745 746 mb_debug("init page %lu\n", page->index); 747 748 inode = page->mapping->host; 749 sb = inode->i_sb; 750 ngroups = ext4_get_groups_count(sb); 751 blocksize = 1 << inode->i_blkbits; 752 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 753 754 groups_per_page = blocks_per_page >> 1; 755 if (groups_per_page == 0) 756 groups_per_page = 1; 757 758 /* allocate buffer_heads to read bitmaps */ 759 if (groups_per_page > 1) { 760 err = -ENOMEM; 761 i = sizeof(struct buffer_head *) * groups_per_page; 762 bh = kzalloc(i, GFP_NOFS); 763 if (bh == NULL) 764 goto out; 765 } else 766 bh = &bhs; 767 768 first_group = page->index * blocks_per_page / 2; 769 770 /* read all groups the page covers into the cache */ 771 for (i = 0; i < groups_per_page; i++) { 772 struct ext4_group_desc *desc; 773 774 if (first_group + i >= ngroups) 775 break; 776 777 err = -EIO; 778 desc = ext4_get_group_desc(sb, first_group + i, NULL); 779 if (desc == NULL) 780 goto out; 781 782 err = -ENOMEM; 783 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); 784 if (bh[i] == NULL) 785 goto out; 786 787 if (bitmap_uptodate(bh[i])) 788 continue; 789 790 lock_buffer(bh[i]); 791 if (bitmap_uptodate(bh[i])) { 792 unlock_buffer(bh[i]); 793 continue; 794 } 795 ext4_lock_group(sb, first_group + i); 796 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 797 ext4_init_block_bitmap(sb, bh[i], 798 first_group + i, desc); 799 set_bitmap_uptodate(bh[i]); 800 set_buffer_uptodate(bh[i]); 801 ext4_unlock_group(sb, first_group + i); 802 unlock_buffer(bh[i]); 803 continue; 804 } 805 ext4_unlock_group(sb, first_group + i); 806 if (buffer_uptodate(bh[i])) { 807 /* 808 * if not uninit if bh is uptodate, 809 * bitmap is also uptodate 810 */ 811 set_bitmap_uptodate(bh[i]); 812 unlock_buffer(bh[i]); 813 continue; 814 } 815 get_bh(bh[i]); 816 /* 817 * submit the buffer_head for read. We can 818 * safely mark the bitmap as uptodate now. 819 * We do it here so the bitmap uptodate bit 820 * get set with buffer lock held. 
821 */ 822 set_bitmap_uptodate(bh[i]); 823 bh[i]->b_end_io = end_buffer_read_sync; 824 submit_bh(READ, bh[i]); 825 mb_debug("read bitmap for group %u\n", first_group + i); 826 } 827 828 /* wait for I/O completion */ 829 for (i = 0; i < groups_per_page && bh[i]; i++) 830 wait_on_buffer(bh[i]); 831 832 err = -EIO; 833 for (i = 0; i < groups_per_page && bh[i]; i++) 834 if (!buffer_uptodate(bh[i])) 835 goto out; 836 837 err = 0; 838 first_block = page->index * blocks_per_page; 839 /* init the page */ 840 memset(page_address(page), 0xff, PAGE_CACHE_SIZE); 841 for (i = 0; i < blocks_per_page; i++) { 842 int group; 843 struct ext4_group_info *grinfo; 844 845 group = (first_block + i) >> 1; 846 if (group >= ngroups) 847 break; 848 849 /* 850 * data carry information regarding this 851 * particular group in the format specified 852 * above 853 * 854 */ 855 data = page_address(page) + (i * blocksize); 856 bitmap = bh[group - first_group]->b_data; 857 858 /* 859 * We place the buddy block and bitmap block 860 * close together 861 */ 862 if ((first_block + i) & 1) { 863 /* this is block of buddy */ 864 BUG_ON(incore == NULL); 865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 group, page->index, i * blocksize); 867 grinfo = ext4_get_group_info(sb, group); 868 grinfo->bb_fragments = 0; 869 memset(grinfo->bb_counters, 0, 870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 /* 872 * incore got set to the group block bitmap below 873 */ 874 ext4_lock_group(sb, group); 875 ext4_mb_generate_buddy(sb, data, incore, group); 876 ext4_unlock_group(sb, group); 877 incore = NULL; 878 } else { 879 /* this is block of bitmap */ 880 BUG_ON(incore != NULL); 881 mb_debug("put bitmap for group %u in page %lu/%x\n", 882 group, page->index, i * blocksize); 883 884 /* see comments in ext4_mb_put_pa() */ 885 ext4_lock_group(sb, group); 886 memcpy(data, bitmap, blocksize); 887 888 /* mark all preallocated blks used in in-core bitmap */ 889 ext4_mb_generate_from_pa(sb, data, group); 890 ext4_mb_generate_from_freelist(sb, data, group); 891 ext4_unlock_group(sb, group); 892 893 /* set incore so that the buddy information can be 894 * generated using this 895 */ 896 incore = data; 897 } 898 } 899 SetPageUptodate(page); 900 901 out: 902 if (bh) { 903 for (i = 0; i < groups_per_page && bh[i]; i++) 904 brelse(bh[i]); 905 if (bh != &bhs) 906 kfree(bh); 907 } 908 return err; 909 } 910 911 static noinline_for_stack int 912 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 913 struct ext4_buddy *e4b) 914 { 915 int blocks_per_page; 916 int block; 917 int pnum; 918 int poff; 919 struct page *page; 920 int ret; 921 struct ext4_group_info *grp; 922 struct ext4_sb_info *sbi = EXT4_SB(sb); 923 struct inode *inode = sbi->s_buddy_cache; 924 925 mb_debug("load group %u\n", group); 926 927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 928 grp = ext4_get_group_info(sb, group); 929 930 e4b->bd_blkbits = sb->s_blocksize_bits; 931 e4b->bd_info = ext4_get_group_info(sb, group); 932 e4b->bd_sb = sb; 933 e4b->bd_group = group; 934 e4b->bd_buddy_page = NULL; 935 e4b->bd_bitmap_page = NULL; 936 e4b->alloc_semp = &grp->alloc_sem; 937 938 /* Take the read lock on the group alloc 939 * sem. This would make sure a parallel 940 * ext4_mb_init_group happening on other 941 * groups mapped by the page is blocked 942 * till we are done with allocation 943 */ 944 down_read(e4b->alloc_semp); 945 946 /* 947 * the buddy cache inode stores the block bitmap 948 * and buddy information in consecutive blocks. 
949 * So for each group we need two blocks. 950 */ 951 block = group * 2; 952 pnum = block / blocks_per_page; 953 poff = block % blocks_per_page; 954 955 /* we could use find_or_create_page(), but it locks page 956 * what we'd like to avoid in fast path ... */ 957 page = find_get_page(inode->i_mapping, pnum); 958 if (page == NULL || !PageUptodate(page)) { 959 if (page) 960 /* 961 * drop the page reference and try 962 * to get the page with lock. If we 963 * are not uptodate that implies 964 * somebody just created the page but 965 * is yet to initialize the same. So 966 * wait for it to initialize. 967 */ 968 page_cache_release(page); 969 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 970 if (page) { 971 BUG_ON(page->mapping != inode->i_mapping); 972 if (!PageUptodate(page)) { 973 ret = ext4_mb_init_cache(page, NULL); 974 if (ret) { 975 unlock_page(page); 976 goto err; 977 } 978 mb_cmp_bitmaps(e4b, page_address(page) + 979 (poff * sb->s_blocksize)); 980 } 981 unlock_page(page); 982 } 983 } 984 if (page == NULL || !PageUptodate(page)) { 985 ret = -EIO; 986 goto err; 987 } 988 e4b->bd_bitmap_page = page; 989 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 990 mark_page_accessed(page); 991 992 block++; 993 pnum = block / blocks_per_page; 994 poff = block % blocks_per_page; 995 996 page = find_get_page(inode->i_mapping, pnum); 997 if (page == NULL || !PageUptodate(page)) { 998 if (page) 999 page_cache_release(page); 1000 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1001 if (page) { 1002 BUG_ON(page->mapping != inode->i_mapping); 1003 if (!PageUptodate(page)) { 1004 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1005 if (ret) { 1006 unlock_page(page); 1007 goto err; 1008 } 1009 } 1010 unlock_page(page); 1011 } 1012 } 1013 if (page == NULL || !PageUptodate(page)) { 1014 ret = -EIO; 1015 goto err; 1016 } 1017 e4b->bd_buddy_page = page; 1018 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1019 mark_page_accessed(page); 1020 1021 BUG_ON(e4b->bd_bitmap_page == NULL); 1022 BUG_ON(e4b->bd_buddy_page == NULL); 1023 1024 return 0; 1025 1026 err: 1027 if (e4b->bd_bitmap_page) 1028 page_cache_release(e4b->bd_bitmap_page); 1029 if (e4b->bd_buddy_page) 1030 page_cache_release(e4b->bd_buddy_page); 1031 e4b->bd_buddy = NULL; 1032 e4b->bd_bitmap = NULL; 1033 1034 /* Done with the buddy cache */ 1035 up_read(e4b->alloc_semp); 1036 return ret; 1037 } 1038 1039 static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1040 { 1041 if (e4b->bd_bitmap_page) 1042 page_cache_release(e4b->bd_bitmap_page); 1043 if (e4b->bd_buddy_page) 1044 page_cache_release(e4b->bd_buddy_page); 1045 /* Done with the buddy cache */ 1046 if (e4b->alloc_semp) 1047 up_read(e4b->alloc_semp); 1048 } 1049 1050 1051 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1052 { 1053 int order = 1; 1054 void *bb; 1055 1056 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1057 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1058 1059 bb = EXT4_MB_BUDDY(e4b); 1060 while (order <= e4b->bd_blkbits + 1) { 1061 block = block >> 1; 1062 if (!mb_test_bit(block, bb)) { 1063 /* this block is part of buddy of order 'order' */ 1064 return order; 1065 } 1066 bb += 1 << (e4b->bd_blkbits - order); 1067 order++; 1068 } 1069 return 0; 1070 } 1071 1072 static void mb_clear_bits(void *bm, int cur, int len) 1073 { 1074 __u32 *addr; 1075 1076 len = cur + len; 1077 while (cur < len) { 1078 if ((cur & 31) == 0 && (len - cur) >= 32) { 1079 /* fast path: clear whole word at once */ 1080 addr 
= bm + (cur >> 3); 1081 *addr = 0; 1082 cur += 32; 1083 continue; 1084 } 1085 mb_clear_bit(cur, bm); 1086 cur++; 1087 } 1088 } 1089 1090 static void mb_set_bits(void *bm, int cur, int len) 1091 { 1092 __u32 *addr; 1093 1094 len = cur + len; 1095 while (cur < len) { 1096 if ((cur & 31) == 0 && (len - cur) >= 32) { 1097 /* fast path: set whole word at once */ 1098 addr = bm + (cur >> 3); 1099 *addr = 0xffffffff; 1100 cur += 32; 1101 continue; 1102 } 1103 mb_set_bit(cur, bm); 1104 cur++; 1105 } 1106 } 1107 1108 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1109 int first, int count) 1110 { 1111 int block = 0; 1112 int max = 0; 1113 int order; 1114 void *buddy; 1115 void *buddy2; 1116 struct super_block *sb = e4b->bd_sb; 1117 1118 BUG_ON(first + count > (sb->s_blocksize << 3)); 1119 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1120 mb_check_buddy(e4b); 1121 mb_free_blocks_double(inode, e4b, first, count); 1122 1123 e4b->bd_info->bb_free += count; 1124 if (first < e4b->bd_info->bb_first_free) 1125 e4b->bd_info->bb_first_free = first; 1126 1127 /* let's maintain fragments counter */ 1128 if (first != 0) 1129 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1130 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1131 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1132 if (block && max) 1133 e4b->bd_info->bb_fragments--; 1134 else if (!block && !max) 1135 e4b->bd_info->bb_fragments++; 1136 1137 /* let's maintain buddy itself */ 1138 while (count-- > 0) { 1139 block = first++; 1140 order = 0; 1141 1142 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1143 ext4_fsblk_t blocknr; 1144 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1145 blocknr += block; 1146 blocknr += 1147 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1148 ext4_grp_locked_error(sb, e4b->bd_group, 1149 __func__, "double-free of inode" 1150 " %lu's block %llu(bit %u in group %u)", 1151 inode ? inode->i_ino : 0, blocknr, block, 1152 e4b->bd_group); 1153 } 1154 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1155 e4b->bd_info->bb_counters[order]++; 1156 1157 /* start of the buddy */ 1158 buddy = mb_find_buddy(e4b, order, &max); 1159 1160 do { 1161 block &= ~1UL; 1162 if (mb_test_bit(block, buddy) || 1163 mb_test_bit(block + 1, buddy)) 1164 break; 1165 1166 /* both the buddies are free, try to coalesce them */ 1167 buddy2 = mb_find_buddy(e4b, order + 1, &max); 1168 1169 if (!buddy2) 1170 break; 1171 1172 if (order > 0) { 1173 /* for special purposes, we don't set 1174 * free bits in bitmap */ 1175 mb_set_bit(block, buddy); 1176 mb_set_bit(block + 1, buddy); 1177 } 1178 e4b->bd_info->bb_counters[order]--; 1179 e4b->bd_info->bb_counters[order]--; 1180 1181 block = block >> 1; 1182 order++; 1183 e4b->bd_info->bb_counters[order]++; 1184 1185 mb_clear_bit(block, buddy2); 1186 buddy = buddy2; 1187 } while (1); 1188 } 1189 mb_check_buddy(e4b); 1190 } 1191 1192 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1193 int needed, struct ext4_free_extent *ex) 1194 { 1195 int next = block; 1196 int max; 1197 int ord; 1198 void *buddy; 1199 1200 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1201 BUG_ON(ex == NULL); 1202 1203 buddy = mb_find_buddy(e4b, order, &max); 1204 BUG_ON(buddy == NULL); 1205 BUG_ON(block >= max); 1206 if (mb_test_bit(block, buddy)) { 1207 ex->fe_len = 0; 1208 ex->fe_start = 0; 1209 ex->fe_group = 0; 1210 return 0; 1211 } 1212 1213 /* FIXME dorp order completely ? 
*/ 1214 if (likely(order == 0)) { 1215 /* find actual order */ 1216 order = mb_find_order_for_block(e4b, block); 1217 block = block >> order; 1218 } 1219 1220 ex->fe_len = 1 << order; 1221 ex->fe_start = block << order; 1222 ex->fe_group = e4b->bd_group; 1223 1224 /* calc difference from given start */ 1225 next = next - ex->fe_start; 1226 ex->fe_len -= next; 1227 ex->fe_start += next; 1228 1229 while (needed > ex->fe_len && 1230 (buddy = mb_find_buddy(e4b, order, &max))) { 1231 1232 if (block + 1 >= max) 1233 break; 1234 1235 next = (block + 1) * (1 << order); 1236 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1237 break; 1238 1239 ord = mb_find_order_for_block(e4b, next); 1240 1241 order = ord; 1242 block = next >> order; 1243 ex->fe_len += 1 << order; 1244 } 1245 1246 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); 1247 return ex->fe_len; 1248 } 1249 1250 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1251 { 1252 int ord; 1253 int mlen = 0; 1254 int max = 0; 1255 int cur; 1256 int start = ex->fe_start; 1257 int len = ex->fe_len; 1258 unsigned ret = 0; 1259 int len0 = len; 1260 void *buddy; 1261 1262 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1263 BUG_ON(e4b->bd_group != ex->fe_group); 1264 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1265 mb_check_buddy(e4b); 1266 mb_mark_used_double(e4b, start, len); 1267 1268 e4b->bd_info->bb_free -= len; 1269 if (e4b->bd_info->bb_first_free == start) 1270 e4b->bd_info->bb_first_free += len; 1271 1272 /* let's maintain fragments counter */ 1273 if (start != 0) 1274 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1275 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1276 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1277 if (mlen && max) 1278 e4b->bd_info->bb_fragments++; 1279 else if (!mlen && !max) 1280 e4b->bd_info->bb_fragments--; 1281 1282 /* let's maintain buddy itself */ 1283 while (len) { 1284 ord = mb_find_order_for_block(e4b, start); 1285 1286 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1287 /* the whole chunk may be allocated at once! */ 1288 mlen = 1 << ord; 1289 buddy = mb_find_buddy(e4b, ord, &max); 1290 BUG_ON((start >> ord) >= max); 1291 mb_set_bit(start >> ord, buddy); 1292 e4b->bd_info->bb_counters[ord]--; 1293 start += mlen; 1294 len -= mlen; 1295 BUG_ON(len < 0); 1296 continue; 1297 } 1298 1299 /* store for history */ 1300 if (ret == 0) 1301 ret = len | (ord << 16); 1302 1303 /* we have to split large buddy */ 1304 BUG_ON(ord <= 0); 1305 buddy = mb_find_buddy(e4b, ord, &max); 1306 mb_set_bit(start >> ord, buddy); 1307 e4b->bd_info->bb_counters[ord]--; 1308 1309 ord--; 1310 cur = (start >> ord) & ~1U; 1311 buddy = mb_find_buddy(e4b, ord, &max); 1312 mb_clear_bit(cur, buddy); 1313 mb_clear_bit(cur + 1, buddy); 1314 e4b->bd_info->bb_counters[ord]++; 1315 e4b->bd_info->bb_counters[ord]++; 1316 } 1317 1318 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1319 mb_check_buddy(e4b); 1320 1321 return ret; 1322 } 1323 1324 /* 1325 * Must be called under group lock! 
1326 */ 1327 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1328 struct ext4_buddy *e4b) 1329 { 1330 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1331 int ret; 1332 1333 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1334 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1335 1336 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1337 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1338 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1339 1340 /* preallocation can change ac_b_ex, thus we store actually 1341 * allocated blocks for history */ 1342 ac->ac_f_ex = ac->ac_b_ex; 1343 1344 ac->ac_status = AC_STATUS_FOUND; 1345 ac->ac_tail = ret & 0xffff; 1346 ac->ac_buddy = ret >> 16; 1347 1348 /* 1349 * take the page reference. We want the page to be pinned 1350 * so that we don't get a ext4_mb_init_cache_call for this 1351 * group until we update the bitmap. That would mean we 1352 * double allocate blocks. The reference is dropped 1353 * in ext4_mb_release_context 1354 */ 1355 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1356 get_page(ac->ac_bitmap_page); 1357 ac->ac_buddy_page = e4b->bd_buddy_page; 1358 get_page(ac->ac_buddy_page); 1359 /* on allocation we use ac to track the held semaphore */ 1360 ac->alloc_semp = e4b->alloc_semp; 1361 e4b->alloc_semp = NULL; 1362 /* store last allocated for subsequent stream allocation */ 1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1364 spin_lock(&sbi->s_md_lock); 1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1367 spin_unlock(&sbi->s_md_lock); 1368 } 1369 } 1370 1371 /* 1372 * regular allocator, for general purposes allocation 1373 */ 1374 1375 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 1376 struct ext4_buddy *e4b, 1377 int finish_group) 1378 { 1379 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1380 struct ext4_free_extent *bex = &ac->ac_b_ex; 1381 struct ext4_free_extent *gex = &ac->ac_g_ex; 1382 struct ext4_free_extent ex; 1383 int max; 1384 1385 if (ac->ac_status == AC_STATUS_FOUND) 1386 return; 1387 /* 1388 * We don't want to scan for a whole year 1389 */ 1390 if (ac->ac_found > sbi->s_mb_max_to_scan && 1391 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1392 ac->ac_status = AC_STATUS_BREAK; 1393 return; 1394 } 1395 1396 /* 1397 * Haven't found good chunk so far, let's continue 1398 */ 1399 if (bex->fe_len < gex->fe_len) 1400 return; 1401 1402 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 1403 && bex->fe_group == e4b->bd_group) { 1404 /* recheck chunk's availability - we don't know 1405 * when it was found (within this lock-unlock 1406 * period or not) */ 1407 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1408 if (max >= gex->fe_len) { 1409 ext4_mb_use_best_found(ac, e4b); 1410 return; 1411 } 1412 } 1413 } 1414 1415 /* 1416 * The routine checks whether found extent is good enough. If it is, 1417 * then the extent gets marked used and flag is set to the context 1418 * to stop scanning. Otherwise, the extent is compared with the 1419 * previous found extent and if new one is better, then it's stored 1420 * in the context. Later, the best found extent will be used, if 1421 * mballoc can't find good enough extent. 1422 * 1423 * FIXME: real allocation policy is to be designed yet! 
1424 */ 1425 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 1426 struct ext4_free_extent *ex, 1427 struct ext4_buddy *e4b) 1428 { 1429 struct ext4_free_extent *bex = &ac->ac_b_ex; 1430 struct ext4_free_extent *gex = &ac->ac_g_ex; 1431 1432 BUG_ON(ex->fe_len <= 0); 1433 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1434 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1435 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1436 1437 ac->ac_found++; 1438 1439 /* 1440 * The special case - take what you catch first 1441 */ 1442 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1443 *bex = *ex; 1444 ext4_mb_use_best_found(ac, e4b); 1445 return; 1446 } 1447 1448 /* 1449 * Let's check whether the chuck is good enough 1450 */ 1451 if (ex->fe_len == gex->fe_len) { 1452 *bex = *ex; 1453 ext4_mb_use_best_found(ac, e4b); 1454 return; 1455 } 1456 1457 /* 1458 * If this is first found extent, just store it in the context 1459 */ 1460 if (bex->fe_len == 0) { 1461 *bex = *ex; 1462 return; 1463 } 1464 1465 /* 1466 * If new found extent is better, store it in the context 1467 */ 1468 if (bex->fe_len < gex->fe_len) { 1469 /* if the request isn't satisfied, any found extent 1470 * larger than previous best one is better */ 1471 if (ex->fe_len > bex->fe_len) 1472 *bex = *ex; 1473 } else if (ex->fe_len > gex->fe_len) { 1474 /* if the request is satisfied, then we try to find 1475 * an extent that still satisfy the request, but is 1476 * smaller than previous one */ 1477 if (ex->fe_len < bex->fe_len) 1478 *bex = *ex; 1479 } 1480 1481 ext4_mb_check_limits(ac, e4b, 0); 1482 } 1483 1484 static noinline_for_stack 1485 int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1486 struct ext4_buddy *e4b) 1487 { 1488 struct ext4_free_extent ex = ac->ac_b_ex; 1489 ext4_group_t group = ex.fe_group; 1490 int max; 1491 int err; 1492 1493 BUG_ON(ex.fe_len <= 0); 1494 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1495 if (err) 1496 return err; 1497 1498 ext4_lock_group(ac->ac_sb, group); 1499 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1500 1501 if (max > 0) { 1502 ac->ac_b_ex = ex; 1503 ext4_mb_use_best_found(ac, e4b); 1504 } 1505 1506 ext4_unlock_group(ac->ac_sb, group); 1507 ext4_mb_release_desc(e4b); 1508 1509 return 0; 1510 } 1511 1512 static noinline_for_stack 1513 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1514 struct ext4_buddy *e4b) 1515 { 1516 ext4_group_t group = ac->ac_g_ex.fe_group; 1517 int max; 1518 int err; 1519 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1520 struct ext4_super_block *es = sbi->s_es; 1521 struct ext4_free_extent ex; 1522 1523 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1524 return 0; 1525 1526 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1527 if (err) 1528 return err; 1529 1530 ext4_lock_group(ac->ac_sb, group); 1531 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1532 ac->ac_g_ex.fe_len, &ex); 1533 1534 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1535 ext4_fsblk_t start; 1536 1537 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1538 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1539 /* use do_div to get remainder (would be 64-bit modulo) */ 1540 if (do_div(start, sbi->s_stripe) == 0) { 1541 ac->ac_found++; 1542 ac->ac_b_ex = ex; 1543 ext4_mb_use_best_found(ac, e4b); 1544 } 1545 } else if (max >= ac->ac_g_ex.fe_len) { 1546 BUG_ON(ex.fe_len <= 0); 1547 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1548 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1549 
ac->ac_found++; 1550 ac->ac_b_ex = ex; 1551 ext4_mb_use_best_found(ac, e4b); 1552 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 1553 /* Sometimes, caller may want to merge even small 1554 * number of blocks to an existing extent */ 1555 BUG_ON(ex.fe_len <= 0); 1556 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1557 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1558 ac->ac_found++; 1559 ac->ac_b_ex = ex; 1560 ext4_mb_use_best_found(ac, e4b); 1561 } 1562 ext4_unlock_group(ac->ac_sb, group); 1563 ext4_mb_release_desc(e4b); 1564 1565 return 0; 1566 } 1567 1568 /* 1569 * The routine scans buddy structures (not bitmap!) from given order 1570 * to max order and tries to find big enough chunk to satisfy the req 1571 */ 1572 static noinline_for_stack 1573 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1574 struct ext4_buddy *e4b) 1575 { 1576 struct super_block *sb = ac->ac_sb; 1577 struct ext4_group_info *grp = e4b->bd_info; 1578 void *buddy; 1579 int i; 1580 int k; 1581 int max; 1582 1583 BUG_ON(ac->ac_2order <= 0); 1584 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1585 if (grp->bb_counters[i] == 0) 1586 continue; 1587 1588 buddy = mb_find_buddy(e4b, i, &max); 1589 BUG_ON(buddy == NULL); 1590 1591 k = mb_find_next_zero_bit(buddy, max, 0); 1592 BUG_ON(k >= max); 1593 1594 ac->ac_found++; 1595 1596 ac->ac_b_ex.fe_len = 1 << i; 1597 ac->ac_b_ex.fe_start = k << i; 1598 ac->ac_b_ex.fe_group = e4b->bd_group; 1599 1600 ext4_mb_use_best_found(ac, e4b); 1601 1602 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); 1603 1604 if (EXT4_SB(sb)->s_mb_stats) 1605 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 1606 1607 break; 1608 } 1609 } 1610 1611 /* 1612 * The routine scans the group and measures all found extents. 1613 * In order to optimize scanning, caller must pass number of 1614 * free blocks in the group, so the routine can know upper limit. 1615 */ 1616 static noinline_for_stack 1617 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1618 struct ext4_buddy *e4b) 1619 { 1620 struct super_block *sb = ac->ac_sb; 1621 void *bitmap = EXT4_MB_BITMAP(e4b); 1622 struct ext4_free_extent ex; 1623 int i; 1624 int free; 1625 1626 free = e4b->bd_info->bb_free; 1627 BUG_ON(free <= 0); 1628 1629 i = e4b->bd_info->bb_first_free; 1630 1631 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1632 i = mb_find_next_zero_bit(bitmap, 1633 EXT4_BLOCKS_PER_GROUP(sb), i); 1634 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1635 /* 1636 * IF we have corrupt bitmap, we won't find any 1637 * free blocks even though group info says we 1638 * we have free blocks 1639 */ 1640 ext4_grp_locked_error(sb, e4b->bd_group, 1641 __func__, "%d free blocks as per " 1642 "group info. But bitmap says 0", 1643 free); 1644 break; 1645 } 1646 1647 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1648 BUG_ON(ex.fe_len <= 0); 1649 if (free < ex.fe_len) { 1650 ext4_grp_locked_error(sb, e4b->bd_group, 1651 __func__, "%d free blocks as per " 1652 "group info. But got %d blocks", 1653 free, ex.fe_len); 1654 /* 1655 * The number of free blocks differs. This mostly 1656 * indicate that the bitmap is corrupt. So exit 1657 * without claiming the space. 
1658 */ 1659 break; 1660 } 1661 1662 ext4_mb_measure_extent(ac, &ex, e4b); 1663 1664 i += ex.fe_len; 1665 free -= ex.fe_len; 1666 } 1667 1668 ext4_mb_check_limits(ac, e4b, 1); 1669 } 1670 1671 /* 1672 * This is a special case for storages like raid5 1673 * we try to find stripe-aligned chunks for stripe-size requests 1674 * XXX should do so at least for multiples of stripe size as well 1675 */ 1676 static noinline_for_stack 1677 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1678 struct ext4_buddy *e4b) 1679 { 1680 struct super_block *sb = ac->ac_sb; 1681 struct ext4_sb_info *sbi = EXT4_SB(sb); 1682 void *bitmap = EXT4_MB_BITMAP(e4b); 1683 struct ext4_free_extent ex; 1684 ext4_fsblk_t first_group_block; 1685 ext4_fsblk_t a; 1686 ext4_grpblk_t i; 1687 int max; 1688 1689 BUG_ON(sbi->s_stripe == 0); 1690 1691 /* find first stripe-aligned block in group */ 1692 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1693 + le32_to_cpu(sbi->s_es->s_first_data_block); 1694 a = first_group_block + sbi->s_stripe - 1; 1695 do_div(a, sbi->s_stripe); 1696 i = (a * sbi->s_stripe) - first_group_block; 1697 1698 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1699 if (!mb_test_bit(i, bitmap)) { 1700 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1701 if (max >= sbi->s_stripe) { 1702 ac->ac_found++; 1703 ac->ac_b_ex = ex; 1704 ext4_mb_use_best_found(ac, e4b); 1705 break; 1706 } 1707 } 1708 i += sbi->s_stripe; 1709 } 1710 } 1711 1712 static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1713 ext4_group_t group, int cr) 1714 { 1715 unsigned free, fragments; 1716 unsigned i, bits; 1717 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1718 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1719 1720 BUG_ON(cr < 0 || cr >= 4); 1721 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1722 1723 free = grp->bb_free; 1724 fragments = grp->bb_fragments; 1725 if (free == 0) 1726 return 0; 1727 if (fragments == 0) 1728 return 0; 1729 1730 switch (cr) { 1731 case 0: 1732 BUG_ON(ac->ac_2order == 0); 1733 1734 /* Avoid using the first bg of a flexgroup for data files */ 1735 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1736 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1737 ((group % flex_size) == 0)) 1738 return 0; 1739 1740 bits = ac->ac_sb->s_blocksize_bits + 1; 1741 for (i = ac->ac_2order; i <= bits; i++) 1742 if (grp->bb_counters[i] > 0) 1743 return 1; 1744 break; 1745 case 1: 1746 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1747 return 1; 1748 break; 1749 case 2: 1750 if (free >= ac->ac_g_ex.fe_len) 1751 return 1; 1752 break; 1753 case 3: 1754 return 1; 1755 default: 1756 BUG(); 1757 } 1758 1759 return 0; 1760 } 1761 1762 /* 1763 * lock the group_info alloc_sem of all the groups 1764 * belonging to the same buddy cache page. This 1765 * make sure other parallel operation on the buddy 1766 * cache doesn't happen whild holding the buddy cache 1767 * lock 1768 */ 1769 int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) 1770 { 1771 int i; 1772 int block, pnum; 1773 int blocks_per_page; 1774 int groups_per_page; 1775 ext4_group_t ngroups = ext4_get_groups_count(sb); 1776 ext4_group_t first_group; 1777 struct ext4_group_info *grp; 1778 1779 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1780 /* 1781 * the buddy cache inode stores the block bitmap 1782 * and buddy information in consecutive blocks. 1783 * So for each group we need two blocks. 
1784 */ 1785 block = group * 2; 1786 pnum = block / blocks_per_page; 1787 first_group = pnum * blocks_per_page / 2; 1788 1789 groups_per_page = blocks_per_page >> 1; 1790 if (groups_per_page == 0) 1791 groups_per_page = 1; 1792 /* read all groups the page covers into the cache */ 1793 for (i = 0; i < groups_per_page; i++) { 1794 1795 if ((first_group + i) >= ngroups) 1796 break; 1797 grp = ext4_get_group_info(sb, first_group + i); 1798 /* take all groups write allocation 1799 * semaphore. This make sure there is 1800 * no block allocation going on in any 1801 * of that groups 1802 */ 1803 down_write_nested(&grp->alloc_sem, i); 1804 } 1805 return i; 1806 } 1807 1808 void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1809 ext4_group_t group, int locked_group) 1810 { 1811 int i; 1812 int block, pnum; 1813 int blocks_per_page; 1814 ext4_group_t first_group; 1815 struct ext4_group_info *grp; 1816 1817 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1818 /* 1819 * the buddy cache inode stores the block bitmap 1820 * and buddy information in consecutive blocks. 1821 * So for each group we need two blocks. 1822 */ 1823 block = group * 2; 1824 pnum = block / blocks_per_page; 1825 first_group = pnum * blocks_per_page / 2; 1826 /* release locks on all the groups */ 1827 for (i = 0; i < locked_group; i++) { 1828 1829 grp = ext4_get_group_info(sb, first_group + i); 1830 /* take all groups write allocation 1831 * semaphore. This make sure there is 1832 * no block allocation going on in any 1833 * of that groups 1834 */ 1835 up_write(&grp->alloc_sem); 1836 } 1837 1838 } 1839 1840 static noinline_for_stack 1841 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1842 { 1843 1844 int ret; 1845 void *bitmap; 1846 int blocks_per_page; 1847 int block, pnum, poff; 1848 int num_grp_locked = 0; 1849 struct ext4_group_info *this_grp; 1850 struct ext4_sb_info *sbi = EXT4_SB(sb); 1851 struct inode *inode = sbi->s_buddy_cache; 1852 struct page *page = NULL, *bitmap_page = NULL; 1853 1854 mb_debug("init group %lu\n", group); 1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1856 this_grp = ext4_get_group_info(sb, group); 1857 /* 1858 * This ensures we don't add group 1859 * to this buddy cache via resize 1860 */ 1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1863 /* 1864 * somebody initialized the group 1865 * return without doing anything 1866 */ 1867 ret = 0; 1868 goto err; 1869 } 1870 /* 1871 * the buddy cache inode stores the block bitmap 1872 * and buddy information in consecutive blocks. 1873 * So for each group we need two blocks. 
1874 */ 1875 block = group * 2; 1876 pnum = block / blocks_per_page; 1877 poff = block % blocks_per_page; 1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1879 if (page) { 1880 BUG_ON(page->mapping != inode->i_mapping); 1881 ret = ext4_mb_init_cache(page, NULL); 1882 if (ret) { 1883 unlock_page(page); 1884 goto err; 1885 } 1886 unlock_page(page); 1887 } 1888 if (page == NULL || !PageUptodate(page)) { 1889 ret = -EIO; 1890 goto err; 1891 } 1892 mark_page_accessed(page); 1893 bitmap_page = page; 1894 bitmap = page_address(page) + (poff * sb->s_blocksize); 1895 1896 /* init buddy cache */ 1897 block++; 1898 pnum = block / blocks_per_page; 1899 poff = block % blocks_per_page; 1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1901 if (page == bitmap_page) { 1902 /* 1903 * If both the bitmap and buddy are in 1904 * the same page we don't need to force 1905 * init the buddy 1906 */ 1907 unlock_page(page); 1908 } else if (page) { 1909 BUG_ON(page->mapping != inode->i_mapping); 1910 ret = ext4_mb_init_cache(page, bitmap); 1911 if (ret) { 1912 unlock_page(page); 1913 goto err; 1914 } 1915 unlock_page(page); 1916 } 1917 if (page == NULL || !PageUptodate(page)) { 1918 ret = -EIO; 1919 goto err; 1920 } 1921 mark_page_accessed(page); 1922 err: 1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1924 if (bitmap_page) 1925 page_cache_release(bitmap_page); 1926 if (page) 1927 page_cache_release(page); 1928 return ret; 1929 } 1930 1931 static noinline_for_stack int 1932 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1933 { 1934 ext4_group_t ngroups, group, i; 1935 int cr; 1936 int err = 0; 1937 int bsbits; 1938 struct ext4_sb_info *sbi; 1939 struct super_block *sb; 1940 struct ext4_buddy e4b; 1941 loff_t size, isize; 1942 1943 sb = ac->ac_sb; 1944 sbi = EXT4_SB(sb); 1945 ngroups = ext4_get_groups_count(sb); 1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1947 1948 /* first, try the goal */ 1949 err = ext4_mb_find_by_goal(ac, &e4b); 1950 if (err || ac->ac_status == AC_STATUS_FOUND) 1951 goto out; 1952 1953 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 1954 goto out; 1955 1956 /* 1957 * ac->ac2_order is set only if the fe_len is a power of 2 1958 * if ac2_order is set we also set criteria to 0 so that we 1959 * try exact allocation using buddy. 1960 */ 1961 i = fls(ac->ac_g_ex.fe_len); 1962 ac->ac_2order = 0; 1963 /* 1964 * We search using buddy data only if the order of the request 1965 * is greater than equal to the sbi_s_mb_order2_reqs 1966 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 1967 */ 1968 if (i >= sbi->s_mb_order2_reqs) { 1969 /* 1970 * This should tell if fe_len is exactly power of 2 1971 */ 1972 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 1973 ac->ac_2order = i - 1; 1974 } 1975 1976 bsbits = ac->ac_sb->s_blocksize_bits; 1977 /* if stream allocation is enabled, use global goal */ 1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 1979 isize = i_size_read(ac->ac_inode) >> bsbits; 1980 if (size < isize) 1981 size = isize; 1982 1983 if (size < sbi->s_mb_stream_request && 1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 1985 /* TBD: may be hot point */ 1986 spin_lock(&sbi->s_md_lock); 1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 1989 spin_unlock(&sbi->s_md_lock); 1990 } 1991 /* Let's just scan groups to find more-less suitable blocks */ 1992 cr = ac->ac_2order ? 
0 : 1; 1993 /* 1994 * cr == 0 try to get exact allocation, 1995 * cr == 3 try to get anything 1996 */ 1997 repeat: 1998 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 1999 ac->ac_criteria = cr; 2000 /* 2001 * searching for the right group start 2002 * from the goal value specified 2003 */ 2004 group = ac->ac_g_ex.fe_group; 2005 2006 for (i = 0; i < ngroups; group++, i++) { 2007 struct ext4_group_info *grp; 2008 struct ext4_group_desc *desc; 2009 2010 if (group == ngroups) 2011 group = 0; 2012 2013 /* quick check to skip empty groups */ 2014 grp = ext4_get_group_info(sb, group); 2015 if (grp->bb_free == 0) 2016 continue; 2017 2018 /* 2019 * if the group is already init we check whether it is 2020 * a good group and if not we don't load the buddy 2021 */ 2022 if (EXT4_MB_GRP_NEED_INIT(grp)) { 2023 /* 2024 * we need full data about the group 2025 * to make a good selection 2026 */ 2027 err = ext4_mb_init_group(sb, group); 2028 if (err) 2029 goto out; 2030 } 2031 2032 /* 2033 * If the particular group doesn't satisfy our 2034 * criteria we continue with the next group 2035 */ 2036 if (!ext4_mb_good_group(ac, group, cr)) 2037 continue; 2038 2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 if (err) 2041 goto out; 2042 2043 ext4_lock_group(sb, group); 2044 if (!ext4_mb_good_group(ac, group, cr)) { 2045 /* someone did allocation from this group */ 2046 ext4_unlock_group(sb, group); 2047 ext4_mb_release_desc(&e4b); 2048 continue; 2049 } 2050 2051 ac->ac_groups_scanned++; 2052 desc = ext4_get_group_desc(sb, group, NULL); 2053 if (cr == 0) 2054 ext4_mb_simple_scan_group(ac, &e4b); 2055 else if (cr == 1 && 2056 ac->ac_g_ex.fe_len == sbi->s_stripe) 2057 ext4_mb_scan_aligned(ac, &e4b); 2058 else 2059 ext4_mb_complex_scan_group(ac, &e4b); 2060 2061 ext4_unlock_group(sb, group); 2062 ext4_mb_release_desc(&e4b); 2063 2064 if (ac->ac_status != AC_STATUS_CONTINUE) 2065 break; 2066 } 2067 } 2068 2069 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2070 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2071 /* 2072 * We've been searching too long. Let's try to allocate 2073 * the best chunk we've found so far 2074 */ 2075 2076 ext4_mb_try_best_found(ac, &e4b); 2077 if (ac->ac_status != AC_STATUS_FOUND) { 2078 /* 2079 * Someone more lucky has already allocated it. 
2080 * The only thing we can do is just take first 2081 * found block(s) 2082 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); 2083 */ 2084 ac->ac_b_ex.fe_group = 0; 2085 ac->ac_b_ex.fe_start = 0; 2086 ac->ac_b_ex.fe_len = 0; 2087 ac->ac_status = AC_STATUS_CONTINUE; 2088 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2089 cr = 3; 2090 atomic_inc(&sbi->s_mb_lost_chunks); 2091 goto repeat; 2092 } 2093 } 2094 out: 2095 return err; 2096 } 2097 2098 #ifdef EXT4_MB_HISTORY 2099 struct ext4_mb_proc_session { 2100 struct ext4_mb_history *history; 2101 struct super_block *sb; 2102 int start; 2103 int max; 2104 }; 2105 2106 static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, 2107 struct ext4_mb_history *hs, 2108 int first) 2109 { 2110 if (hs == s->history + s->max) 2111 hs = s->history; 2112 if (!first && hs == s->history + s->start) 2113 return NULL; 2114 while (hs->orig.fe_len == 0) { 2115 hs++; 2116 if (hs == s->history + s->max) 2117 hs = s->history; 2118 if (hs == s->history + s->start) 2119 return NULL; 2120 } 2121 return hs; 2122 } 2123 2124 static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) 2125 { 2126 struct ext4_mb_proc_session *s = seq->private; 2127 struct ext4_mb_history *hs; 2128 int l = *pos; 2129 2130 if (l == 0) 2131 return SEQ_START_TOKEN; 2132 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); 2133 if (!hs) 2134 return NULL; 2135 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); 2136 return hs; 2137 } 2138 2139 static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, 2140 loff_t *pos) 2141 { 2142 struct ext4_mb_proc_session *s = seq->private; 2143 struct ext4_mb_history *hs = v; 2144 2145 ++*pos; 2146 if (v == SEQ_START_TOKEN) 2147 return ext4_mb_history_skip_empty(s, s->history + s->start, 1); 2148 else 2149 return ext4_mb_history_skip_empty(s, ++hs, 0); 2150 } 2151 2152 static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) 2153 { 2154 char buf[25], buf2[25], buf3[25], *fmt; 2155 struct ext4_mb_history *hs = v; 2156 2157 if (v == SEQ_START_TOKEN) { 2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2160 "pid", "inode", "original", "goal", "result", "found", 2161 "grps", "cr", "flags", "merge", "tail", "broken"); 2162 return 0; 2163 } 2164 2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2167 "%-5u %-5s %-5u %-6u\n"; 2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2169 hs->result.fe_start, hs->result.fe_len, 2170 hs->result.fe_logical); 2171 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, 2172 hs->orig.fe_start, hs->orig.fe_len, 2173 hs->orig.fe_logical); 2174 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, 2175 hs->goal.fe_start, hs->goal.fe_len, 2176 hs->goal.fe_logical); 2177 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2178 hs->found, hs->groups, hs->cr, hs->flags, 2179 hs->merged ? "M" : "", hs->tail, 2180 hs->buddy ? 
1 << hs->buddy : 0); 2181 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2182 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2183 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2184 hs->result.fe_start, hs->result.fe_len, 2185 hs->result.fe_logical); 2186 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, 2187 hs->orig.fe_start, hs->orig.fe_len, 2188 hs->orig.fe_logical); 2189 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2190 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2191 sprintf(buf2, "%u/%d/%u", hs->result.fe_group, 2192 hs->result.fe_start, hs->result.fe_len); 2193 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2194 hs->pid, hs->ino, buf2); 2195 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2196 sprintf(buf2, "%u/%d/%u", hs->result.fe_group, 2197 hs->result.fe_start, hs->result.fe_len); 2198 seq_printf(seq, "%-5u %-8u %-23s free\n", 2199 hs->pid, hs->ino, buf2); 2200 } 2201 return 0; 2202 } 2203 2204 static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) 2205 { 2206 } 2207 2208 static struct seq_operations ext4_mb_seq_history_ops = { 2209 .start = ext4_mb_seq_history_start, 2210 .next = ext4_mb_seq_history_next, 2211 .stop = ext4_mb_seq_history_stop, 2212 .show = ext4_mb_seq_history_show, 2213 }; 2214 2215 static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) 2216 { 2217 struct super_block *sb = PDE(inode)->data; 2218 struct ext4_sb_info *sbi = EXT4_SB(sb); 2219 struct ext4_mb_proc_session *s; 2220 int rc; 2221 int size; 2222 2223 if (unlikely(sbi->s_mb_history == NULL)) 2224 return -ENOMEM; 2225 s = kmalloc(sizeof(*s), GFP_KERNEL); 2226 if (s == NULL) 2227 return -ENOMEM; 2228 s->sb = sb; 2229 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; 2230 s->history = kmalloc(size, GFP_KERNEL); 2231 if (s->history == NULL) { 2232 kfree(s); 2233 return -ENOMEM; 2234 } 2235 2236 spin_lock(&sbi->s_mb_history_lock); 2237 memcpy(s->history, sbi->s_mb_history, size); 2238 s->max = sbi->s_mb_history_max; 2239 s->start = sbi->s_mb_history_cur % s->max; 2240 spin_unlock(&sbi->s_mb_history_lock); 2241 2242 rc = seq_open(file, &ext4_mb_seq_history_ops); 2243 if (rc == 0) { 2244 struct seq_file *m = (struct seq_file *)file->private_data; 2245 m->private = s; 2246 } else { 2247 kfree(s->history); 2248 kfree(s); 2249 } 2250 return rc; 2251 2252 } 2253 2254 static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) 2255 { 2256 struct seq_file *seq = (struct seq_file *)file->private_data; 2257 struct ext4_mb_proc_session *s = seq->private; 2258 kfree(s->history); 2259 kfree(s); 2260 return seq_release(inode, file); 2261 } 2262 2263 static ssize_t ext4_mb_seq_history_write(struct file *file, 2264 const char __user *buffer, 2265 size_t count, loff_t *ppos) 2266 { 2267 struct seq_file *seq = (struct seq_file *)file->private_data; 2268 struct ext4_mb_proc_session *s = seq->private; 2269 struct super_block *sb = s->sb; 2270 char str[32]; 2271 int value; 2272 2273 if (count >= sizeof(str)) { 2274 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", 2275 "mb_history", (int)sizeof(str)); 2276 return -EOVERFLOW; 2277 } 2278 2279 if (copy_from_user(str, buffer, count)) 2280 return -EFAULT; 2281 2282 value = simple_strtol(str, NULL, 0); 2283 if (value < 0) 2284 return -ERANGE; 2285 EXT4_SB(sb)->s_mb_history_filter = value; 2286 2287 return count; 2288 } 2289 2290 static struct file_operations ext4_mb_seq_history_fops = { 2291 .owner = THIS_MODULE, 2292 .open = ext4_mb_seq_history_open, 2293 .read = seq_read, 2294 .write = 
ext4_mb_seq_history_write, 2295 .llseek = seq_lseek, 2296 .release = ext4_mb_seq_history_release, 2297 }; 2298 2299 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2300 { 2301 struct super_block *sb = seq->private; 2302 ext4_group_t group; 2303 2304 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2305 return NULL; 2306 group = *pos + 1; 2307 return (void *) ((unsigned long) group); 2308 } 2309 2310 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2311 { 2312 struct super_block *sb = seq->private; 2313 ext4_group_t group; 2314 2315 ++*pos; 2316 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2317 return NULL; 2318 group = *pos + 1; 2319 return (void *) ((unsigned long) group); 2320 } 2321 2322 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2323 { 2324 struct super_block *sb = seq->private; 2325 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2326 int i; 2327 int err; 2328 struct ext4_buddy e4b; 2329 struct sg { 2330 struct ext4_group_info info; 2331 unsigned short counters[16]; 2332 } sg; 2333 2334 group--; 2335 if (group == 0) 2336 seq_printf(seq, "#%-5s: %-5s %-5s %-5s " 2337 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " 2338 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", 2339 "group", "free", "frags", "first", 2340 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", 2341 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); 2342 2343 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2344 sizeof(struct ext4_group_info); 2345 err = ext4_mb_load_buddy(sb, group, &e4b); 2346 if (err) { 2347 seq_printf(seq, "#%-5u: I/O error\n", group); 2348 return 0; 2349 } 2350 ext4_lock_group(sb, group); 2351 memcpy(&sg, ext4_get_group_info(sb, group), i); 2352 ext4_unlock_group(sb, group); 2353 ext4_mb_release_desc(&e4b); 2354 2355 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2356 sg.info.bb_fragments, sg.info.bb_first_free); 2357 for (i = 0; i <= 13; i++) 2358 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
2359 sg.info.bb_counters[i] : 0); 2360 seq_printf(seq, " ]\n"); 2361 2362 return 0; 2363 } 2364 2365 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2366 { 2367 } 2368 2369 static struct seq_operations ext4_mb_seq_groups_ops = { 2370 .start = ext4_mb_seq_groups_start, 2371 .next = ext4_mb_seq_groups_next, 2372 .stop = ext4_mb_seq_groups_stop, 2373 .show = ext4_mb_seq_groups_show, 2374 }; 2375 2376 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) 2377 { 2378 struct super_block *sb = PDE(inode)->data; 2379 int rc; 2380 2381 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2382 if (rc == 0) { 2383 struct seq_file *m = (struct seq_file *)file->private_data; 2384 m->private = sb; 2385 } 2386 return rc; 2387 2388 } 2389 2390 static struct file_operations ext4_mb_seq_groups_fops = { 2391 .owner = THIS_MODULE, 2392 .open = ext4_mb_seq_groups_open, 2393 .read = seq_read, 2394 .llseek = seq_lseek, 2395 .release = seq_release, 2396 }; 2397 2398 static void ext4_mb_history_release(struct super_block *sb) 2399 { 2400 struct ext4_sb_info *sbi = EXT4_SB(sb); 2401 2402 if (sbi->s_proc != NULL) { 2403 remove_proc_entry("mb_groups", sbi->s_proc); 2404 if (sbi->s_mb_history_max) 2405 remove_proc_entry("mb_history", sbi->s_proc); 2406 } 2407 kfree(sbi->s_mb_history); 2408 } 2409 2410 static void ext4_mb_history_init(struct super_block *sb) 2411 { 2412 struct ext4_sb_info *sbi = EXT4_SB(sb); 2413 int i; 2414 2415 if (sbi->s_proc != NULL) { 2416 if (sbi->s_mb_history_max) 2417 proc_create_data("mb_history", S_IRUGO, sbi->s_proc, 2418 &ext4_mb_seq_history_fops, sb); 2419 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2420 &ext4_mb_seq_groups_fops, sb); 2421 } 2422 2423 sbi->s_mb_history_cur = 0; 2424 spin_lock_init(&sbi->s_mb_history_lock); 2425 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2426 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; 2427 /* if we can't allocate history, then we simple won't use it */ 2428 } 2429 2430 static noinline_for_stack void 2431 ext4_mb_store_history(struct ext4_allocation_context *ac) 2432 { 2433 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2434 struct ext4_mb_history h; 2435 2436 if (sbi->s_mb_history == NULL) 2437 return; 2438 2439 if (!(ac->ac_op & sbi->s_mb_history_filter)) 2440 return; 2441 2442 h.op = ac->ac_op; 2443 h.pid = current->pid; 2444 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; 2445 h.orig = ac->ac_o_ex; 2446 h.result = ac->ac_b_ex; 2447 h.flags = ac->ac_flags; 2448 h.found = ac->ac_found; 2449 h.groups = ac->ac_groups_scanned; 2450 h.cr = ac->ac_criteria; 2451 h.tail = ac->ac_tail; 2452 h.buddy = ac->ac_buddy; 2453 h.merged = 0; 2454 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { 2455 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 2456 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 2457 h.merged = 1; 2458 h.goal = ac->ac_g_ex; 2459 h.result = ac->ac_f_ex; 2460 } 2461 2462 spin_lock(&sbi->s_mb_history_lock); 2463 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); 2464 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) 2465 sbi->s_mb_history_cur = 0; 2466 spin_unlock(&sbi->s_mb_history_lock); 2467 } 2468 2469 #else 2470 #define ext4_mb_history_release(sb) 2471 #define ext4_mb_history_init(sb) 2472 #endif 2473 2474 2475 /* Create and initialize ext4_group_info data for the given group. 
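 * Group infos live in a two-level table: sbi->s_group_info[group >>
 * EXT4_DESC_PER_BLOCK_BITS(sb)] points to an array of EXT4_DESC_PER_BLOCK(sb)
 * pointers indexed by group & (EXT4_DESC_PER_BLOCK(sb) - 1). For
 * illustration, assuming 4KB blocks and 32-byte descriptors (128
 * descriptors per block), group 300 is found at slot 44 of the pointer
 * table at outer index 2.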
*/ 2476 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2477 struct ext4_group_desc *desc) 2478 { 2479 int i, len; 2480 int metalen = 0; 2481 struct ext4_sb_info *sbi = EXT4_SB(sb); 2482 struct ext4_group_info **meta_group_info; 2483 2484 /* 2485 * First check if this group is the first of a reserved block. 2486 * If it's true, we have to allocate a new table of pointers 2487 * to ext4_group_info structures 2488 */ 2489 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2490 metalen = sizeof(*meta_group_info) << 2491 EXT4_DESC_PER_BLOCK_BITS(sb); 2492 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2493 if (meta_group_info == NULL) { 2494 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2495 "buddy group\n"); 2496 goto exit_meta_group_info; 2497 } 2498 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2499 meta_group_info; 2500 } 2501 2502 /* 2503 * calculate needed size. if change bb_counters size, 2504 * don't forget about ext4_mb_generate_buddy() 2505 */ 2506 len = offsetof(typeof(**meta_group_info), 2507 bb_counters[sb->s_blocksize_bits + 2]); 2508 2509 meta_group_info = 2510 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2511 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2512 2513 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2514 if (meta_group_info[i] == NULL) { 2515 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2516 goto exit_group_info; 2517 } 2518 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2519 &(meta_group_info[i]->bb_state)); 2520 2521 /* 2522 * initialize bb_free to be able to skip 2523 * empty groups without initialization 2524 */ 2525 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2526 meta_group_info[i]->bb_free = 2527 ext4_free_blocks_after_init(sb, group, desc); 2528 } else { 2529 meta_group_info[i]->bb_free = 2530 ext4_free_blks_count(sb, desc); 2531 } 2532 2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2536 2537 #ifdef DOUBLE_CHECK 2538 { 2539 struct buffer_head *bh; 2540 meta_group_info[i]->bb_bitmap = 2541 kmalloc(sb->s_blocksize, GFP_KERNEL); 2542 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2543 bh = ext4_read_block_bitmap(sb, group); 2544 BUG_ON(bh == NULL); 2545 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, 2546 sb->s_blocksize); 2547 put_bh(bh); 2548 } 2549 #endif 2550 2551 return 0; 2552 2553 exit_group_info: 2554 /* If a meta_group_info table has been allocated, release it now */ 2555 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2556 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2557 exit_meta_group_info: 2558 return -ENOMEM; 2559 } /* ext4_mb_add_groupinfo */ 2560 2561 /* 2562 * Update an existing group. 
 * This function is used for online resize
 */
void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
{
	grp->bb_free += add;
}

static int ext4_mb_init_backend(struct super_block *sb)
{
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	ext4_group_t i;
	int metalen;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int num_meta_group_infos;
	int num_meta_group_infos_max;
	int array_size;
	struct ext4_group_info **meta_group_info;
	struct ext4_group_desc *desc;

	/* This is the number of blocks used by the GDT */
	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);

	/*
	 * This is the total number of blocks used by the GDT including
	 * the number of reserved GDT blocks.
	 * The s_group_info array is allocated with this value
	 * to allow a clean online resize without a complex
	 * manipulation of pointers.
	 * The drawback is unused memory when no resize occurs,
	 * but it is very low in terms of pages
	 * (see the comments below).
	 * Need to handle this properly when META_BG resizing is allowed
	 */
	num_meta_group_infos_max = num_meta_group_infos +
		le16_to_cpu(es->s_reserved_gdt_blocks);

	/*
	 * array_size is the size of the s_group_info array. We round it
	 * up to the next power of two because kmalloc rounds up internally
	 * anyway, so the extra memory comes for free here (e.g. it may be
	 * used for a META_BG resize).
	 */
	array_size = 1;
	while (array_size < sizeof(*sbi->s_group_info) *
	       num_meta_group_infos_max)
		array_size = array_size << 1;
	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
	 * kmalloc. A 128KB kmalloc should suffice for a 256TB filesystem.
	 * So a two level scheme suffices for now.
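	 * For illustration, assuming 4KB blocks (32768 blocks per group)
	 * and 32-byte descriptors: an 8TB filesystem has 2^31 blocks,
	 * hence 65536 groups and 65536 / 128 = 512 meta_group_info
	 * pointers, which is exactly the 4096 bytes of 64-bit pointers
	 * mentioned above.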
*/ 2614 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2615 if (sbi->s_group_info == NULL) { 2616 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2617 return -ENOMEM; 2618 } 2619 sbi->s_buddy_cache = new_inode(sb); 2620 if (sbi->s_buddy_cache == NULL) { 2621 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2622 goto err_freesgi; 2623 } 2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2625 2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); 2627 for (i = 0; i < num_meta_group_infos; i++) { 2628 if ((i + 1) == num_meta_group_infos) 2629 metalen = sizeof(*meta_group_info) * 2630 (ngroups - 2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb))); 2632 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2633 if (meta_group_info == NULL) { 2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2635 "buddy group\n"); 2636 goto err_freemeta; 2637 } 2638 sbi->s_group_info[i] = meta_group_info; 2639 } 2640 2641 for (i = 0; i < ngroups; i++) { 2642 desc = ext4_get_group_desc(sb, i, NULL); 2643 if (desc == NULL) { 2644 printk(KERN_ERR 2645 "EXT4-fs: can't read descriptor %u\n", i); 2646 goto err_freebuddy; 2647 } 2648 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2649 goto err_freebuddy; 2650 } 2651 2652 return 0; 2653 2654 err_freebuddy: 2655 while (i-- > 0) 2656 kfree(ext4_get_group_info(sb, i)); 2657 i = num_meta_group_infos; 2658 err_freemeta: 2659 while (i-- > 0) 2660 kfree(sbi->s_group_info[i]); 2661 iput(sbi->s_buddy_cache); 2662 err_freesgi: 2663 kfree(sbi->s_group_info); 2664 return -ENOMEM; 2665 } 2666 2667 int ext4_mb_init(struct super_block *sb, int needs_recovery) 2668 { 2669 struct ext4_sb_info *sbi = EXT4_SB(sb); 2670 unsigned i, j; 2671 unsigned offset; 2672 unsigned max; 2673 int ret; 2674 2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2676 2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2678 if (sbi->s_mb_offsets == NULL) { 2679 return -ENOMEM; 2680 } 2681 2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2684 if (sbi->s_mb_maxs == NULL) { 2685 kfree(sbi->s_mb_offsets); 2686 return -ENOMEM; 2687 } 2688 2689 /* order 0 is regular bitmap */ 2690 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2691 sbi->s_mb_offsets[0] = 0; 2692 2693 i = 1; 2694 offset = 0; 2695 max = sb->s_blocksize << 2; 2696 do { 2697 sbi->s_mb_offsets[i] = offset; 2698 sbi->s_mb_maxs[i] = max; 2699 offset += 1 << (sb->s_blocksize_bits - i); 2700 max = max >> 1; 2701 i++; 2702 } while (i <= sb->s_blocksize_bits + 1); 2703 2704 /* init file for buddy data */ 2705 ret = ext4_mb_init_backend(sb); 2706 if (ret != 0) { 2707 kfree(sbi->s_mb_offsets); 2708 kfree(sbi->s_mb_maxs); 2709 return ret; 2710 } 2711 2712 spin_lock_init(&sbi->s_md_lock); 2713 spin_lock_init(&sbi->s_bal_lock); 2714 2715 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2716 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 2717 sbi->s_mb_stats = MB_DEFAULT_STATS; 2718 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2719 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2720 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2721 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2722 2723 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2724 if (sbi->s_locality_groups == NULL) { 2725 kfree(sbi->s_mb_offsets); 2726 kfree(sbi->s_mb_maxs); 2727 return -ENOMEM; 2728 } 2729 for_each_possible_cpu(i) { 2730 struct ext4_locality_group *lg; 2731 lg = per_cpu_ptr(sbi->s_locality_groups, i); 2732 mutex_init(&lg->lg_mutex); 2733 for (j = 0; j < 
PREALLOC_TB_SIZE; j++) 2734 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2735 spin_lock_init(&lg->lg_prealloc_lock); 2736 } 2737 2738 ext4_mb_history_init(sb); 2739 2740 if (sbi->s_journal) 2741 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2742 2743 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2744 return 0; 2745 } 2746 2747 /* need to called with the ext4 group lock held */ 2748 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2749 { 2750 struct ext4_prealloc_space *pa; 2751 struct list_head *cur, *tmp; 2752 int count = 0; 2753 2754 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 2755 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2756 list_del(&pa->pa_group_list); 2757 count++; 2758 kmem_cache_free(ext4_pspace_cachep, pa); 2759 } 2760 if (count) 2761 mb_debug("mballoc: %u PAs left\n", count); 2762 2763 } 2764 2765 int ext4_mb_release(struct super_block *sb) 2766 { 2767 ext4_group_t ngroups = ext4_get_groups_count(sb); 2768 ext4_group_t i; 2769 int num_meta_group_infos; 2770 struct ext4_group_info *grinfo; 2771 struct ext4_sb_info *sbi = EXT4_SB(sb); 2772 2773 if (sbi->s_group_info) { 2774 for (i = 0; i < ngroups; i++) { 2775 grinfo = ext4_get_group_info(sb, i); 2776 #ifdef DOUBLE_CHECK 2777 kfree(grinfo->bb_bitmap); 2778 #endif 2779 ext4_lock_group(sb, i); 2780 ext4_mb_cleanup_pa(grinfo); 2781 ext4_unlock_group(sb, i); 2782 kfree(grinfo); 2783 } 2784 num_meta_group_infos = (ngroups + 2785 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2786 EXT4_DESC_PER_BLOCK_BITS(sb); 2787 for (i = 0; i < num_meta_group_infos; i++) 2788 kfree(sbi->s_group_info[i]); 2789 kfree(sbi->s_group_info); 2790 } 2791 kfree(sbi->s_mb_offsets); 2792 kfree(sbi->s_mb_maxs); 2793 if (sbi->s_buddy_cache) 2794 iput(sbi->s_buddy_cache); 2795 if (sbi->s_mb_stats) { 2796 printk(KERN_INFO 2797 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2798 atomic_read(&sbi->s_bal_allocated), 2799 atomic_read(&sbi->s_bal_reqs), 2800 atomic_read(&sbi->s_bal_success)); 2801 printk(KERN_INFO 2802 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2803 "%u 2^N hits, %u breaks, %u lost\n", 2804 atomic_read(&sbi->s_bal_ex_scanned), 2805 atomic_read(&sbi->s_bal_goals), 2806 atomic_read(&sbi->s_bal_2orders), 2807 atomic_read(&sbi->s_bal_breaks), 2808 atomic_read(&sbi->s_mb_lost_chunks)); 2809 printk(KERN_INFO 2810 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2811 sbi->s_mb_buddies_generated++, 2812 sbi->s_mb_generation_time); 2813 printk(KERN_INFO 2814 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2815 atomic_read(&sbi->s_mb_preallocated), 2816 atomic_read(&sbi->s_mb_discarded)); 2817 } 2818 2819 free_percpu(sbi->s_locality_groups); 2820 ext4_mb_history_release(sb); 2821 2822 return 0; 2823 } 2824 2825 /* 2826 * This function is called by the jbd2 layer once the commit has finished, 2827 * so we know we can free the blocks that were released with that commit. 
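 * Each ext4_free_data entry queued on txn->t_private_list describes one
 * freed extent: we return it to the buddy, remove it from the per-group
 * rb-tree and, finally, issue a discard for the corresponding range.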
2828 */ 2829 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2830 { 2831 struct super_block *sb = journal->j_private; 2832 struct ext4_buddy e4b; 2833 struct ext4_group_info *db; 2834 int err, count = 0, count2 = 0; 2835 struct ext4_free_data *entry; 2836 ext4_fsblk_t discard_block; 2837 struct list_head *l, *ltmp; 2838 2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2840 entry = list_entry(l, struct ext4_free_data, list); 2841 2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2843 entry->count, entry->group, entry); 2844 2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2846 /* we expect to find existing buddy because it's pinned */ 2847 BUG_ON(err != 0); 2848 2849 db = e4b.bd_info; 2850 /* there are blocks to put in buddy to make them really free */ 2851 count += entry->count; 2852 count2++; 2853 ext4_lock_group(sb, entry->group); 2854 /* Take it out of per group rb tree */ 2855 rb_erase(&entry->node, &(db->bb_free_root)); 2856 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2857 2858 if (!db->bb_free_root.rb_node) { 2859 /* No more items in the per group rb tree 2860 * balance refcounts from ext4_mb_free_metadata() 2861 */ 2862 page_cache_release(e4b.bd_buddy_page); 2863 page_cache_release(e4b.bd_bitmap_page); 2864 } 2865 ext4_unlock_group(sb, entry->group); 2866 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2867 + entry->start_blk 2868 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2869 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, 2870 entry->count); 2871 sb_issue_discard(sb, discard_block, entry->count); 2872 2873 kmem_cache_free(ext4_free_ext_cachep, entry); 2874 ext4_mb_release_desc(&e4b); 2875 } 2876 2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2878 } 2879 2880 int __init init_ext4_mballoc(void) 2881 { 2882 ext4_pspace_cachep = 2883 kmem_cache_create("ext4_prealloc_space", 2884 sizeof(struct ext4_prealloc_space), 2885 0, SLAB_RECLAIM_ACCOUNT, NULL); 2886 if (ext4_pspace_cachep == NULL) 2887 return -ENOMEM; 2888 2889 ext4_ac_cachep = 2890 kmem_cache_create("ext4_alloc_context", 2891 sizeof(struct ext4_allocation_context), 2892 0, SLAB_RECLAIM_ACCOUNT, NULL); 2893 if (ext4_ac_cachep == NULL) { 2894 kmem_cache_destroy(ext4_pspace_cachep); 2895 return -ENOMEM; 2896 } 2897 2898 ext4_free_ext_cachep = 2899 kmem_cache_create("ext4_free_block_extents", 2900 sizeof(struct ext4_free_data), 2901 0, SLAB_RECLAIM_ACCOUNT, NULL); 2902 if (ext4_free_ext_cachep == NULL) { 2903 kmem_cache_destroy(ext4_pspace_cachep); 2904 kmem_cache_destroy(ext4_ac_cachep); 2905 return -ENOMEM; 2906 } 2907 return 0; 2908 } 2909 2910 void exit_ext4_mballoc(void) 2911 { 2912 /* 2913 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2914 * before destroying the slab cache. 
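 * (ext4_mb_put_pa(), for example, frees preallocation descriptors into this
 * cache via call_rcu(); rcu_barrier() guarantees those callbacks have
 * completed before kmem_cache_destroy() runs.)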
2915 */ 2916 rcu_barrier(); 2917 kmem_cache_destroy(ext4_pspace_cachep); 2918 kmem_cache_destroy(ext4_ac_cachep); 2919 kmem_cache_destroy(ext4_free_ext_cachep); 2920 } 2921 2922 2923 /* 2924 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps 2925 * Returns 0 if success or error code 2926 */ 2927 static noinline_for_stack int 2928 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2929 handle_t *handle, unsigned int reserv_blks) 2930 { 2931 struct buffer_head *bitmap_bh = NULL; 2932 struct ext4_super_block *es; 2933 struct ext4_group_desc *gdp; 2934 struct buffer_head *gdp_bh; 2935 struct ext4_sb_info *sbi; 2936 struct super_block *sb; 2937 ext4_fsblk_t block; 2938 int err, len; 2939 2940 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 2941 BUG_ON(ac->ac_b_ex.fe_len <= 0); 2942 2943 sb = ac->ac_sb; 2944 sbi = EXT4_SB(sb); 2945 es = sbi->s_es; 2946 2947 2948 err = -EIO; 2949 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2950 if (!bitmap_bh) 2951 goto out_err; 2952 2953 err = ext4_journal_get_write_access(handle, bitmap_bh); 2954 if (err) 2955 goto out_err; 2956 2957 err = -EIO; 2958 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 2959 if (!gdp) 2960 goto out_err; 2961 2962 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2963 ext4_free_blks_count(sb, gdp)); 2964 2965 err = ext4_journal_get_write_access(handle, gdp_bh); 2966 if (err) 2967 goto out_err; 2968 2969 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2970 + ac->ac_b_ex.fe_start 2971 + le32_to_cpu(es->s_first_data_block); 2972 2973 len = ac->ac_b_ex.fe_len; 2974 if (!ext4_data_block_valid(sbi, block, len)) { 2975 ext4_error(sb, __func__, 2976 "Allocating blocks %llu-%llu which overlap " 2977 "fs metadata\n", block, block+len); 2978 /* File system mounted not to panic on error 2979 * Fix the bitmap and repeat the block allocation 2980 * We leak some of the blocks here. 2981 */ 2982 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2983 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2984 ac->ac_b_ex.fe_len); 2985 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2986 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2987 if (!err) 2988 err = -EAGAIN; 2989 goto out_err; 2990 } 2991 2992 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2993 #ifdef AGGRESSIVE_CHECK 2994 { 2995 int i; 2996 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 2997 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 2998 bitmap_bh->b_data)); 2999 } 3000 } 3001 #endif 3002 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); 3003 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3004 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3005 ext4_free_blks_set(sb, gdp, 3006 ext4_free_blocks_after_init(sb, 3007 ac->ac_b_ex.fe_group, gdp)); 3008 } 3009 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 3010 ext4_free_blks_set(sb, gdp, len); 3011 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3012 3013 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3014 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3015 /* 3016 * Now reduce the dirty block count also. 
Should not go negative.
 */
	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
		/* release all the reserved blocks in the non-delalloc case */
		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
	else {
		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
						ac->ac_b_ex.fe_len);
		/* convert reserved quota blocks to real quota blocks */
		vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
	}

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi,
							  ac->ac_b_ex.fe_group);
		atomic_sub(ac->ac_b_ex.fe_len,
			   &sbi->s_flex_groups[flex_group].free_blocks);
	}

	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
	if (err)
		goto out_err;
	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);

out_err:
	sb->s_dirt = 1;
	brelse(bitmap_bh);
	return err;
}

/*
 * Here we normalize the request for a locality group.
 * Group requests are normalized to the stripe size (sbi->s_stripe) if a
 * stripe was set via the mount option. If not, we set it to
 * s_mb_group_prealloc, which can be configured via
 * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg = ac->ac_lg;

	BUG_ON(lg == NULL);
	if (EXT4_SB(sb)->s_stripe)
		ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
	else
		ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
	mb_debug("#%u: goal %u blocks for locality group\n",
		current->pid, ac->ac_g_ex.fe_len);
}

/*
 * Normalization means making the request better in terms of
 * size and alignment
 */
static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
				struct ext4_allocation_request *ar)
{
	int bsbits, max;
	ext4_lblk_t end;
	loff_t size, orig_size, start_off;
	ext4_lblk_t start, orig_start;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_prealloc_space *pa;

	/* only normalize data requests; metadata requests
	   do not need preallocation */
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;

	/* sometimes the caller may want exactly the requested blocks */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	/* the caller may indicate that preallocation isn't
	 * required (it's a tail, for example) */
	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
		return;

	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
		ext4_mb_normalize_group_request(ac);
		return;
	}

	bsbits = ac->ac_sb->s_blocksize_bits;

	/* first, let's learn the actual file size
	 * assuming the current request is allocated */
	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
	size = size << bsbits;
	if (size < i_size_read(ac->ac_inode))
		size = i_size_read(ac->ac_inode);

	/* max size of free chunks */
	max = 2 << bsbits;

#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
		(req <= (size) || max <= (chunk_size))

	/* first, try to predict the file size */
	/* XXX: should this table be tunable?
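	 * Roughly: while the predicted file size stays at or below 1MB the
	 * request is rounded up to the next bucket (16K, 32K, ..., 1MB);
	 * mid-size requests are placed in 2MB/4MB/8MB aligned windows of
	 * that size; anything larger is left at its original logical
	 * offset and length.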
*/ 3119 start_off = 0; 3120 if (size <= 16 * 1024) { 3121 size = 16 * 1024; 3122 } else if (size <= 32 * 1024) { 3123 size = 32 * 1024; 3124 } else if (size <= 64 * 1024) { 3125 size = 64 * 1024; 3126 } else if (size <= 128 * 1024) { 3127 size = 128 * 1024; 3128 } else if (size <= 256 * 1024) { 3129 size = 256 * 1024; 3130 } else if (size <= 512 * 1024) { 3131 size = 512 * 1024; 3132 } else if (size <= 1024 * 1024) { 3133 size = 1024 * 1024; 3134 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 3135 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3136 (21 - bsbits)) << 21; 3137 size = 2 * 1024 * 1024; 3138 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 3139 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3140 (22 - bsbits)) << 22; 3141 size = 4 * 1024 * 1024; 3142 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 3143 (8<<20)>>bsbits, max, 8 * 1024)) { 3144 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3145 (23 - bsbits)) << 23; 3146 size = 8 * 1024 * 1024; 3147 } else { 3148 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 3149 size = ac->ac_o_ex.fe_len << bsbits; 3150 } 3151 orig_size = size = size >> bsbits; 3152 orig_start = start = start_off >> bsbits; 3153 3154 /* don't cover already allocated blocks in selected range */ 3155 if (ar->pleft && start <= ar->lleft) { 3156 size -= ar->lleft + 1 - start; 3157 start = ar->lleft + 1; 3158 } 3159 if (ar->pright && start + size - 1 >= ar->lright) 3160 size -= start + size - ar->lright; 3161 3162 end = start + size; 3163 3164 /* check we don't cross already preallocated blocks */ 3165 rcu_read_lock(); 3166 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3167 ext4_lblk_t pa_end; 3168 3169 if (pa->pa_deleted) 3170 continue; 3171 spin_lock(&pa->pa_lock); 3172 if (pa->pa_deleted) { 3173 spin_unlock(&pa->pa_lock); 3174 continue; 3175 } 3176 3177 pa_end = pa->pa_lstart + pa->pa_len; 3178 3179 /* PA must not overlap original request */ 3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3182 3183 /* skip PA normalized request doesn't overlap with */ 3184 if (pa->pa_lstart >= end) { 3185 spin_unlock(&pa->pa_lock); 3186 continue; 3187 } 3188 if (pa_end <= start) { 3189 spin_unlock(&pa->pa_lock); 3190 continue; 3191 } 3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3193 3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 3195 BUG_ON(pa_end < start); 3196 start = pa_end; 3197 } 3198 3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 3200 BUG_ON(pa->pa_lstart > end); 3201 end = pa->pa_lstart; 3202 } 3203 spin_unlock(&pa->pa_lock); 3204 } 3205 rcu_read_unlock(); 3206 size = end - start; 3207 3208 /* XXX: extra loop to check we really don't overlap preallocations */ 3209 rcu_read_lock(); 3210 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3211 ext4_lblk_t pa_end; 3212 spin_lock(&pa->pa_lock); 3213 if (pa->pa_deleted == 0) { 3214 pa_end = pa->pa_lstart + pa->pa_len; 3215 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3216 } 3217 spin_unlock(&pa->pa_lock); 3218 } 3219 rcu_read_unlock(); 3220 3221 if (start + size <= ac->ac_o_ex.fe_logical && 3222 start > ac->ac_o_ex.fe_logical) { 3223 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3224 (unsigned long) start, (unsigned long) size, 3225 (unsigned long) ac->ac_o_ex.fe_logical); 3226 } 3227 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3228 start > ac->ac_o_ex.fe_logical); 3229 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3230 3231 /* now prepare goal request 
*/ 3232 3233 /* XXX: is it better to align blocks WRT to logical 3234 * placement or satisfy big request as is */ 3235 ac->ac_g_ex.fe_logical = start; 3236 ac->ac_g_ex.fe_len = size; 3237 3238 /* define goal start in order to merge */ 3239 if (ar->pright && (ar->lright == (start + size))) { 3240 /* merge to the right */ 3241 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 3242 &ac->ac_f_ex.fe_group, 3243 &ac->ac_f_ex.fe_start); 3244 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3245 } 3246 if (ar->pleft && (ar->lleft + 1 == start)) { 3247 /* merge to the left */ 3248 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 3249 &ac->ac_f_ex.fe_group, 3250 &ac->ac_f_ex.fe_start); 3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3252 } 3253 3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3255 (unsigned) orig_size, (unsigned) start); 3256 } 3257 3258 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 3259 { 3260 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3261 3262 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3263 atomic_inc(&sbi->s_bal_reqs); 3264 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3265 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3266 atomic_inc(&sbi->s_bal_success); 3267 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3268 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3269 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 3270 atomic_inc(&sbi->s_bal_goals); 3271 if (ac->ac_found > sbi->s_mb_max_to_scan) 3272 atomic_inc(&sbi->s_bal_breaks); 3273 } 3274 3275 ext4_mb_store_history(ac); 3276 } 3277 3278 /* 3279 * use blocks preallocated to inode 3280 */ 3281 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3282 struct ext4_prealloc_space *pa) 3283 { 3284 ext4_fsblk_t start; 3285 ext4_fsblk_t end; 3286 int len; 3287 3288 /* found preallocated blocks, use them */ 3289 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3290 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3291 len = end - start; 3292 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3293 &ac->ac_b_ex.fe_start); 3294 ac->ac_b_ex.fe_len = len; 3295 ac->ac_status = AC_STATUS_FOUND; 3296 ac->ac_pa = pa; 3297 3298 BUG_ON(start < pa->pa_pstart); 3299 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3300 BUG_ON(pa->pa_free < len); 3301 pa->pa_free -= len; 3302 3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3304 } 3305 3306 /* 3307 * use blocks preallocated to locality group 3308 */ 3309 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 3310 struct ext4_prealloc_space *pa) 3311 { 3312 unsigned int len = ac->ac_o_ex.fe_len; 3313 3314 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 3315 &ac->ac_b_ex.fe_group, 3316 &ac->ac_b_ex.fe_start); 3317 ac->ac_b_ex.fe_len = len; 3318 ac->ac_status = AC_STATUS_FOUND; 3319 ac->ac_pa = pa; 3320 3321 /* we don't correct pa_pstart or pa_plen here to avoid 3322 * possible race when the group is being loaded concurrently 3323 * instead we correct pa later, after blocks are marked 3324 * in on-disk bitmap -- see ext4_mb_release_context() 3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3326 */ 3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3328 } 3329 3330 /* 3331 * Return the prealloc space that have minimal distance 3332 * from the goal block. @cpa is the prealloc 3333 * space that is having currently known minimal distance 3334 * from the goal block. 
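 * For example, with a goal block of 1000, a current best @cpa starting at
 * block 900 (distance 100) loses to a @pa starting at block 1010
 * (distance 10): we grab a reference on @pa and drop the one held on @cpa.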
3335 */ 3336 static struct ext4_prealloc_space * 3337 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 3338 struct ext4_prealloc_space *pa, 3339 struct ext4_prealloc_space *cpa) 3340 { 3341 ext4_fsblk_t cur_distance, new_distance; 3342 3343 if (cpa == NULL) { 3344 atomic_inc(&pa->pa_count); 3345 return pa; 3346 } 3347 cur_distance = abs(goal_block - cpa->pa_pstart); 3348 new_distance = abs(goal_block - pa->pa_pstart); 3349 3350 if (cur_distance < new_distance) 3351 return cpa; 3352 3353 /* drop the previous reference */ 3354 atomic_dec(&cpa->pa_count); 3355 atomic_inc(&pa->pa_count); 3356 return pa; 3357 } 3358 3359 /* 3360 * search goal blocks in preallocated space 3361 */ 3362 static noinline_for_stack int 3363 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3364 { 3365 int order, i; 3366 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3367 struct ext4_locality_group *lg; 3368 struct ext4_prealloc_space *pa, *cpa = NULL; 3369 ext4_fsblk_t goal_block; 3370 3371 /* only data can be preallocated */ 3372 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3373 return 0; 3374 3375 /* first, try per-file preallocation */ 3376 rcu_read_lock(); 3377 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3378 3379 /* all fields in this condition don't change, 3380 * so we can skip locking for them */ 3381 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3383 continue; 3384 3385 /* found preallocated blocks, use them */ 3386 spin_lock(&pa->pa_lock); 3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3388 atomic_inc(&pa->pa_count); 3389 ext4_mb_use_inode_pa(ac, pa); 3390 spin_unlock(&pa->pa_lock); 3391 ac->ac_criteria = 10; 3392 rcu_read_unlock(); 3393 return 1; 3394 } 3395 spin_unlock(&pa->pa_lock); 3396 } 3397 rcu_read_unlock(); 3398 3399 /* can we use group allocation? */ 3400 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 3401 return 0; 3402 3403 /* inode may have no locality group for some reason */ 3404 lg = ac->ac_lg; 3405 if (lg == NULL) 3406 return 0; 3407 order = fls(ac->ac_o_ex.fe_len) - 1; 3408 if (order > PREALLOC_TB_SIZE - 1) 3409 /* The max size of hash table is PREALLOC_TB_SIZE */ 3410 order = PREALLOC_TB_SIZE - 1; 3411 3412 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3413 ac->ac_g_ex.fe_start + 3414 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); 3415 /* 3416 * search for the prealloc space that is having 3417 * minimal distance from the goal block. 3418 */ 3419 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3420 rcu_read_lock(); 3421 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3422 pa_inode_list) { 3423 spin_lock(&pa->pa_lock); 3424 if (pa->pa_deleted == 0 && 3425 pa->pa_free >= ac->ac_o_ex.fe_len) { 3426 3427 cpa = ext4_mb_check_group_pa(goal_block, 3428 pa, cpa); 3429 } 3430 spin_unlock(&pa->pa_lock); 3431 } 3432 rcu_read_unlock(); 3433 } 3434 if (cpa) { 3435 ext4_mb_use_group_pa(ac, cpa); 3436 ac->ac_criteria = 20; 3437 return 1; 3438 } 3439 return 0; 3440 } 3441 3442 /* 3443 * the function goes through all block freed in the group 3444 * but not yet committed and marks them used in in-core bitmap. 
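 * (these entries sit in grp->bb_free_root, the rb-tree that
 * ext4_mb_free_metadata() fills; keeping such blocks marked busy prevents
 * them from being reallocated before the committing transaction releases
 * them in release_blocks_on_commit())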
3445 * buddy must be generated from this bitmap 3446 * Need to be called with the ext4 group lock held 3447 */ 3448 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3449 ext4_group_t group) 3450 { 3451 struct rb_node *n; 3452 struct ext4_group_info *grp; 3453 struct ext4_free_data *entry; 3454 3455 grp = ext4_get_group_info(sb, group); 3456 n = rb_first(&(grp->bb_free_root)); 3457 3458 while (n) { 3459 entry = rb_entry(n, struct ext4_free_data, node); 3460 mb_set_bits(bitmap, entry->start_blk, entry->count); 3461 n = rb_next(n); 3462 } 3463 return; 3464 } 3465 3466 /* 3467 * the function goes through all preallocation in this group and marks them 3468 * used in in-core bitmap. buddy must be generated from this bitmap 3469 * Need to be called with ext4 group lock held 3470 */ 3471 static noinline_for_stack 3472 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3473 ext4_group_t group) 3474 { 3475 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3476 struct ext4_prealloc_space *pa; 3477 struct list_head *cur; 3478 ext4_group_t groupnr; 3479 ext4_grpblk_t start; 3480 int preallocated = 0; 3481 int count = 0; 3482 int len; 3483 3484 /* all form of preallocation discards first load group, 3485 * so the only competing code is preallocation use. 3486 * we don't need any locking here 3487 * notice we do NOT ignore preallocations with pa_deleted 3488 * otherwise we could leave used blocks available for 3489 * allocation in buddy when concurrent ext4_mb_put_pa() 3490 * is dropping preallocation 3491 */ 3492 list_for_each(cur, &grp->bb_prealloc_list) { 3493 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3494 spin_lock(&pa->pa_lock); 3495 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3496 &groupnr, &start); 3497 len = pa->pa_len; 3498 spin_unlock(&pa->pa_lock); 3499 if (unlikely(len == 0)) 3500 continue; 3501 BUG_ON(groupnr != group); 3502 mb_set_bits(bitmap, start, len); 3503 preallocated += len; 3504 count++; 3505 } 3506 mb_debug("prellocated %u for group %u\n", preallocated, group); 3507 } 3508 3509 static void ext4_mb_pa_callback(struct rcu_head *head) 3510 { 3511 struct ext4_prealloc_space *pa; 3512 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 3513 kmem_cache_free(ext4_pspace_cachep, pa); 3514 } 3515 3516 /* 3517 * drops a reference to preallocated space descriptor 3518 * if this was the last reference and the space is consumed 3519 */ 3520 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3521 struct super_block *sb, struct ext4_prealloc_space *pa) 3522 { 3523 ext4_group_t grp; 3524 ext4_fsblk_t grp_blk; 3525 3526 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3527 return; 3528 3529 /* in this short window concurrent discard can set pa_deleted */ 3530 spin_lock(&pa->pa_lock); 3531 if (pa->pa_deleted == 1) { 3532 spin_unlock(&pa->pa_lock); 3533 return; 3534 } 3535 3536 pa->pa_deleted = 1; 3537 spin_unlock(&pa->pa_lock); 3538 3539 grp_blk = pa->pa_pstart; 3540 /* 3541 * If doing group-based preallocation, pa_pstart may be in the 3542 * next group when pa is used up 3543 */ 3544 if (pa->pa_type == MB_GROUP_PA) 3545 grp_blk--; 3546 3547 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3548 3549 /* 3550 * possible race: 3551 * 3552 * P1 (buddy init) P2 (regular allocation) 3553 * find block B in PA 3554 * copy on-disk bitmap to buddy 3555 * mark B in on-disk bitmap 3556 * drop PA from group 3557 * mark all PAs in buddy 3558 * 3559 * thus, P1 initializes buddy with B 
available. to prevent this 3560 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 3561 * against that pair 3562 */ 3563 ext4_lock_group(sb, grp); 3564 list_del(&pa->pa_group_list); 3565 ext4_unlock_group(sb, grp); 3566 3567 spin_lock(pa->pa_obj_lock); 3568 list_del_rcu(&pa->pa_inode_list); 3569 spin_unlock(pa->pa_obj_lock); 3570 3571 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3572 } 3573 3574 /* 3575 * creates new preallocated space for given inode 3576 */ 3577 static noinline_for_stack int 3578 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3579 { 3580 struct super_block *sb = ac->ac_sb; 3581 struct ext4_prealloc_space *pa; 3582 struct ext4_group_info *grp; 3583 struct ext4_inode_info *ei; 3584 3585 /* preallocate only when found space is larger then requested */ 3586 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3587 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3588 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3589 3590 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3591 if (pa == NULL) 3592 return -ENOMEM; 3593 3594 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 3595 int winl; 3596 int wins; 3597 int win; 3598 int offs; 3599 3600 /* we can't allocate as much as normalizer wants. 3601 * so, found space must get proper lstart 3602 * to cover original request */ 3603 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 3604 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 3605 3606 /* we're limited by original request in that 3607 * logical block must be covered any way 3608 * winl is window we can move our chunk within */ 3609 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3610 3611 /* also, we should cover whole original request */ 3612 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3613 3614 /* the smallest one defines real window */ 3615 win = min(winl, wins); 3616 3617 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3618 if (offs && offs < win) 3619 win = offs; 3620 3621 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3622 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3623 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3624 } 3625 3626 /* preallocation can change ac_b_ex, thus we store actually 3627 * allocated blocks for history */ 3628 ac->ac_f_ex = ac->ac_b_ex; 3629 3630 pa->pa_lstart = ac->ac_b_ex.fe_logical; 3631 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3632 pa->pa_len = ac->ac_b_ex.fe_len; 3633 pa->pa_free = pa->pa_len; 3634 atomic_set(&pa->pa_count, 1); 3635 spin_lock_init(&pa->pa_lock); 3636 INIT_LIST_HEAD(&pa->pa_inode_list); 3637 INIT_LIST_HEAD(&pa->pa_group_list); 3638 pa->pa_deleted = 0; 3639 pa->pa_type = MB_INODE_PA; 3640 3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3643 trace_ext4_mb_new_inode_pa(ac, pa); 3644 3645 ext4_mb_use_inode_pa(ac, pa); 3646 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3647 3648 ei = EXT4_I(ac->ac_inode); 3649 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3650 3651 pa->pa_obj_lock = &ei->i_prealloc_lock; 3652 pa->pa_inode = ac->ac_inode; 3653 3654 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3655 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3656 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3657 3658 spin_lock(pa->pa_obj_lock); 3659 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 3660 spin_unlock(pa->pa_obj_lock); 3661 3662 return 0; 3663 } 3664 3665 /* 3666 * creates new preallocated space for locality group inodes belongs to 3667 */ 3668 static noinline_for_stack 
int 3669 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 3670 { 3671 struct super_block *sb = ac->ac_sb; 3672 struct ext4_locality_group *lg; 3673 struct ext4_prealloc_space *pa; 3674 struct ext4_group_info *grp; 3675 3676 /* preallocate only when found space is larger then requested */ 3677 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3678 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3679 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3680 3681 BUG_ON(ext4_pspace_cachep == NULL); 3682 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3683 if (pa == NULL) 3684 return -ENOMEM; 3685 3686 /* preallocation can change ac_b_ex, thus we store actually 3687 * allocated blocks for history */ 3688 ac->ac_f_ex = ac->ac_b_ex; 3689 3690 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3691 pa->pa_lstart = pa->pa_pstart; 3692 pa->pa_len = ac->ac_b_ex.fe_len; 3693 pa->pa_free = pa->pa_len; 3694 atomic_set(&pa->pa_count, 1); 3695 spin_lock_init(&pa->pa_lock); 3696 INIT_LIST_HEAD(&pa->pa_inode_list); 3697 INIT_LIST_HEAD(&pa->pa_group_list); 3698 pa->pa_deleted = 0; 3699 pa->pa_type = MB_GROUP_PA; 3700 3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3703 trace_ext4_mb_new_group_pa(ac, pa); 3704 3705 ext4_mb_use_group_pa(ac, pa); 3706 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3707 3708 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3709 lg = ac->ac_lg; 3710 BUG_ON(lg == NULL); 3711 3712 pa->pa_obj_lock = &lg->lg_prealloc_lock; 3713 pa->pa_inode = NULL; 3714 3715 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3716 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3717 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3718 3719 /* 3720 * We will later add the new pa to the right bucket 3721 * after updating the pa_free in ext4_mb_release_context 3722 */ 3723 return 0; 3724 } 3725 3726 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 3727 { 3728 int err; 3729 3730 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 3731 err = ext4_mb_new_group_pa(ac); 3732 else 3733 err = ext4_mb_new_inode_pa(ac); 3734 return err; 3735 } 3736 3737 /* 3738 * finds all unused blocks in on-disk bitmap, frees them in 3739 * in-core bitmap and buddy. 3740 * @pa must be unlinked from inode and group lists, so that 3741 * nobody else can find/use it. 3742 * the caller MUST hold group/inode locks. 
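 * The loop below walks the on-disk bitmap over [pa_pstart, pa_pstart +
 * pa_len), returns every run of clear bits to the buddy via
 * mb_free_blocks() and finally cross-checks the freed total against
 * pa_free.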
3743 * TODO: optimize the case when there are no in-core structures yet 3744 */ 3745 static noinline_for_stack int 3746 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3747 struct ext4_prealloc_space *pa, 3748 struct ext4_allocation_context *ac) 3749 { 3750 struct super_block *sb = e4b->bd_sb; 3751 struct ext4_sb_info *sbi = EXT4_SB(sb); 3752 unsigned int end; 3753 unsigned int next; 3754 ext4_group_t group; 3755 ext4_grpblk_t bit; 3756 unsigned long long grp_blk_start; 3757 sector_t start; 3758 int err = 0; 3759 int free = 0; 3760 3761 BUG_ON(pa->pa_deleted == 0); 3762 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3763 grp_blk_start = pa->pa_pstart - bit; 3764 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3765 end = bit + pa->pa_len; 3766 3767 if (ac) { 3768 ac->ac_sb = sb; 3769 ac->ac_inode = pa->pa_inode; 3770 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3771 } 3772 3773 while (bit < end) { 3774 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3775 if (bit >= end) 3776 break; 3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3780 mb_debug(" free preallocated %u/%u in group %u\n", 3781 (unsigned) start, (unsigned) next - bit, 3782 (unsigned) group); 3783 free += next - bit; 3784 3785 if (ac) { 3786 ac->ac_b_ex.fe_group = group; 3787 ac->ac_b_ex.fe_start = bit; 3788 ac->ac_b_ex.fe_len = next - bit; 3789 ac->ac_b_ex.fe_logical = 0; 3790 ext4_mb_store_history(ac); 3791 } 3792 3793 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3794 next - bit); 3795 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3796 bit = next + 1; 3797 } 3798 if (free != pa->pa_free) { 3799 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3800 pa, (unsigned long) pa->pa_lstart, 3801 (unsigned long) pa->pa_pstart, 3802 (unsigned long) pa->pa_len); 3803 ext4_grp_locked_error(sb, group, 3804 __func__, "free %u, pa_free %u", 3805 free, pa->pa_free); 3806 /* 3807 * pa is already deleted so we use the value obtained 3808 * from the bitmap and continue. 
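 * (in other words, the freshly recounted 'free' is what gets added to
 * s_mb_discarded below, not the stale pa_free value).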
3809 */ 3810 } 3811 atomic_add(free, &sbi->s_mb_discarded); 3812 3813 return err; 3814 } 3815 3816 static noinline_for_stack int 3817 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3818 struct ext4_prealloc_space *pa, 3819 struct ext4_allocation_context *ac) 3820 { 3821 struct super_block *sb = e4b->bd_sb; 3822 ext4_group_t group; 3823 ext4_grpblk_t bit; 3824 3825 if (ac) 3826 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3827 3828 trace_ext4_mb_release_group_pa(ac, pa); 3829 BUG_ON(pa->pa_deleted == 0); 3830 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3831 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3832 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3833 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3834 3835 if (ac) { 3836 ac->ac_sb = sb; 3837 ac->ac_inode = NULL; 3838 ac->ac_b_ex.fe_group = group; 3839 ac->ac_b_ex.fe_start = bit; 3840 ac->ac_b_ex.fe_len = pa->pa_len; 3841 ac->ac_b_ex.fe_logical = 0; 3842 ext4_mb_store_history(ac); 3843 } 3844 3845 return 0; 3846 } 3847 3848 /* 3849 * releases all preallocations in given group 3850 * 3851 * first, we need to decide discard policy: 3852 * - when do we discard 3853 * 1) ENOSPC 3854 * - how many do we discard 3855 * 1) how many requested 3856 */ 3857 static noinline_for_stack int 3858 ext4_mb_discard_group_preallocations(struct super_block *sb, 3859 ext4_group_t group, int needed) 3860 { 3861 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3862 struct buffer_head *bitmap_bh = NULL; 3863 struct ext4_prealloc_space *pa, *tmp; 3864 struct ext4_allocation_context *ac; 3865 struct list_head list; 3866 struct ext4_buddy e4b; 3867 int err; 3868 int busy = 0; 3869 int free = 0; 3870 3871 mb_debug("discard preallocation for group %u\n", group); 3872 3873 if (list_empty(&grp->bb_prealloc_list)) 3874 return 0; 3875 3876 bitmap_bh = ext4_read_block_bitmap(sb, group); 3877 if (bitmap_bh == NULL) { 3878 ext4_error(sb, __func__, "Error in reading block " 3879 "bitmap for %u", group); 3880 return 0; 3881 } 3882 3883 err = ext4_mb_load_buddy(sb, group, &e4b); 3884 if (err) { 3885 ext4_error(sb, __func__, "Error in loading buddy " 3886 "information for %u", group); 3887 put_bh(bitmap_bh); 3888 return 0; 3889 } 3890 3891 if (needed == 0) 3892 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3893 3894 INIT_LIST_HEAD(&list); 3895 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3896 if (ac) 3897 ac->ac_sb = sb; 3898 repeat: 3899 ext4_lock_group(sb, group); 3900 list_for_each_entry_safe(pa, tmp, 3901 &grp->bb_prealloc_list, pa_group_list) { 3902 spin_lock(&pa->pa_lock); 3903 if (atomic_read(&pa->pa_count)) { 3904 spin_unlock(&pa->pa_lock); 3905 busy = 1; 3906 continue; 3907 } 3908 if (pa->pa_deleted) { 3909 spin_unlock(&pa->pa_lock); 3910 continue; 3911 } 3912 3913 /* seems this one can be freed ... */ 3914 pa->pa_deleted = 1; 3915 3916 /* we can trust pa_free ... */ 3917 free += pa->pa_free; 3918 3919 spin_unlock(&pa->pa_lock); 3920 3921 list_del(&pa->pa_group_list); 3922 list_add(&pa->u.pa_tmp_list, &list); 3923 } 3924 3925 /* if we still need more blocks and some PAs were used, try again */ 3926 if (free < needed && busy) { 3927 busy = 0; 3928 ext4_unlock_group(sb, group); 3929 /* 3930 * Yield the CPU here so that we don't get soft lockup 3931 * in non preempt case. 3932 */ 3933 yield(); 3934 goto repeat; 3935 } 3936 3937 /* found anything to free? 
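 * if not, every pa in this group was either busy or already being
 * deleted, so the 'free' counter must still be zero (checked below)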
*/ 3938 if (list_empty(&list)) { 3939 BUG_ON(free != 0); 3940 goto out; 3941 } 3942 3943 /* now free all selected PAs */ 3944 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3945 3946 /* remove from object (inode or locality group) */ 3947 spin_lock(pa->pa_obj_lock); 3948 list_del_rcu(&pa->pa_inode_list); 3949 spin_unlock(pa->pa_obj_lock); 3950 3951 if (pa->pa_type == MB_GROUP_PA) 3952 ext4_mb_release_group_pa(&e4b, pa, ac); 3953 else 3954 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3955 3956 list_del(&pa->u.pa_tmp_list); 3957 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3958 } 3959 3960 out: 3961 ext4_unlock_group(sb, group); 3962 if (ac) 3963 kmem_cache_free(ext4_ac_cachep, ac); 3964 ext4_mb_release_desc(&e4b); 3965 put_bh(bitmap_bh); 3966 return free; 3967 } 3968 3969 /* 3970 * releases all non-used preallocated blocks for given inode 3971 * 3972 * It's important to discard preallocations under i_data_sem 3973 * We don't want another block to be served from the prealloc 3974 * space when we are discarding the inode prealloc space. 3975 * 3976 * FIXME!! Make sure it is valid at all the call sites 3977 */ 3978 void ext4_discard_preallocations(struct inode *inode) 3979 { 3980 struct ext4_inode_info *ei = EXT4_I(inode); 3981 struct super_block *sb = inode->i_sb; 3982 struct buffer_head *bitmap_bh = NULL; 3983 struct ext4_prealloc_space *pa, *tmp; 3984 struct ext4_allocation_context *ac; 3985 ext4_group_t group = 0; 3986 struct list_head list; 3987 struct ext4_buddy e4b; 3988 int err; 3989 3990 if (!S_ISREG(inode->i_mode)) { 3991 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3992 return; 3993 } 3994 3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 3996 trace_ext4_discard_preallocations(inode); 3997 3998 INIT_LIST_HEAD(&list); 3999 4000 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4001 if (ac) { 4002 ac->ac_sb = sb; 4003 ac->ac_inode = inode; 4004 } 4005 repeat: 4006 /* first, collect all pa's in the inode */ 4007 spin_lock(&ei->i_prealloc_lock); 4008 while (!list_empty(&ei->i_prealloc_list)) { 4009 pa = list_entry(ei->i_prealloc_list.next, 4010 struct ext4_prealloc_space, pa_inode_list); 4011 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 4012 spin_lock(&pa->pa_lock); 4013 if (atomic_read(&pa->pa_count)) { 4014 /* this shouldn't happen often - nobody should 4015 * use preallocation while we're discarding it */ 4016 spin_unlock(&pa->pa_lock); 4017 spin_unlock(&ei->i_prealloc_lock); 4018 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 4019 WARN_ON(1); 4020 schedule_timeout_uninterruptible(HZ); 4021 goto repeat; 4022 4023 } 4024 if (pa->pa_deleted == 0) { 4025 pa->pa_deleted = 1; 4026 spin_unlock(&pa->pa_lock); 4027 list_del_rcu(&pa->pa_inode_list); 4028 list_add(&pa->u.pa_tmp_list, &list); 4029 continue; 4030 } 4031 4032 /* someone is deleting pa right now */ 4033 spin_unlock(&pa->pa_lock); 4034 spin_unlock(&ei->i_prealloc_lock); 4035 4036 /* we have to wait here because pa_deleted 4037 * doesn't mean pa is already unlinked from 4038 * the list. 
as we might be called from 4039 * ->clear_inode() the inode will get freed 4040 * and concurrent thread which is unlinking 4041 * pa from inode's list may access already 4042 * freed memory, bad-bad-bad */ 4043 4044 /* XXX: if this happens too often, we can 4045 * add a flag to force wait only in case 4046 * of ->clear_inode(), but not in case of 4047 * regular truncate */ 4048 schedule_timeout_uninterruptible(HZ); 4049 goto repeat; 4050 } 4051 spin_unlock(&ei->i_prealloc_lock); 4052 4053 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4054 BUG_ON(pa->pa_type != MB_INODE_PA); 4055 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4056 4057 err = ext4_mb_load_buddy(sb, group, &e4b); 4058 if (err) { 4059 ext4_error(sb, __func__, "Error in loading buddy " 4060 "information for %u", group); 4061 continue; 4062 } 4063 4064 bitmap_bh = ext4_read_block_bitmap(sb, group); 4065 if (bitmap_bh == NULL) { 4066 ext4_error(sb, __func__, "Error in reading block " 4067 "bitmap for %u", group); 4068 ext4_mb_release_desc(&e4b); 4069 continue; 4070 } 4071 4072 ext4_lock_group(sb, group); 4073 list_del(&pa->pa_group_list); 4074 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 4075 ext4_unlock_group(sb, group); 4076 4077 ext4_mb_release_desc(&e4b); 4078 put_bh(bitmap_bh); 4079 4080 list_del(&pa->u.pa_tmp_list); 4081 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4082 } 4083 if (ac) 4084 kmem_cache_free(ext4_ac_cachep, ac); 4085 } 4086 4087 /* 4088 * finds all preallocated spaces and return blocks being freed to them 4089 * if preallocated space becomes full (no block is used from the space) 4090 * then the function frees space in buddy 4091 * XXX: at the moment, truncate (which is the only way to free blocks) 4092 * discards all preallocations 4093 */ 4094 static void ext4_mb_return_to_preallocation(struct inode *inode, 4095 struct ext4_buddy *e4b, 4096 sector_t block, int count) 4097 { 4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4099 } 4100 #ifdef MB_DEBUG 4101 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4102 { 4103 struct super_block *sb = ac->ac_sb; 4104 ext4_group_t ngroups, i; 4105 4106 printk(KERN_ERR "EXT4-fs: Can't allocate:" 4107 " Allocation context details:\n"); 4108 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 4109 ac->ac_status, ac->ac_flags); 4110 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 4111 "best %lu/%lu/%lu@%lu cr %d\n", 4112 (unsigned long)ac->ac_o_ex.fe_group, 4113 (unsigned long)ac->ac_o_ex.fe_start, 4114 (unsigned long)ac->ac_o_ex.fe_len, 4115 (unsigned long)ac->ac_o_ex.fe_logical, 4116 (unsigned long)ac->ac_g_ex.fe_group, 4117 (unsigned long)ac->ac_g_ex.fe_start, 4118 (unsigned long)ac->ac_g_ex.fe_len, 4119 (unsigned long)ac->ac_g_ex.fe_logical, 4120 (unsigned long)ac->ac_b_ex.fe_group, 4121 (unsigned long)ac->ac_b_ex.fe_start, 4122 (unsigned long)ac->ac_b_ex.fe_len, 4123 (unsigned long)ac->ac_b_ex.fe_logical, 4124 (int)ac->ac_criteria); 4125 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 4126 ac->ac_found); 4127 printk(KERN_ERR "EXT4-fs: groups: \n"); 4128 ngroups = ext4_get_groups_count(sb); 4129 for (i = 0; i < ngroups; i++) { 4130 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 4131 struct ext4_prealloc_space *pa; 4132 ext4_grpblk_t start; 4133 struct list_head *cur; 4134 ext4_lock_group(sb, i); 4135 list_for_each(cur, &grp->bb_prealloc_list) { 4136 pa = list_entry(cur, struct ext4_prealloc_space, 4137 pa_group_list); 4138 spin_lock(&pa->pa_lock); 4139 
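			/* pa_lock keeps pa_pstart and pa_len stable while each
			 * PA is reported as group:offset:length below */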
ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4140 NULL, &start); 4141 spin_unlock(&pa->pa_lock); 4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4143 start, pa->pa_len); 4144 } 4145 ext4_unlock_group(sb, i); 4146 4147 if (grp->bb_free == 0) 4148 continue; 4149 printk(KERN_ERR "%lu: %d/%d \n", 4150 i, grp->bb_free, grp->bb_fragments); 4151 } 4152 printk(KERN_ERR "\n"); 4153 } 4154 #else 4155 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4156 { 4157 return; 4158 } 4159 #endif 4160 4161 /* 4162 * We use locality group preallocation for small size file. The size of the 4163 * file is determined by the current size or the resulting size after 4164 * allocation which ever is larger 4165 * 4166 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 4167 */ 4168 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4169 { 4170 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4171 int bsbits = ac->ac_sb->s_blocksize_bits; 4172 loff_t size, isize; 4173 4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4175 return; 4176 4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4178 isize = i_size_read(ac->ac_inode) >> bsbits; 4179 size = max(size, isize); 4180 4181 /* don't use group allocation for large files */ 4182 if (size >= sbi->s_mb_stream_request) 4183 return; 4184 4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4186 return; 4187 4188 BUG_ON(ac->ac_lg != NULL); 4189 /* 4190 * locality group prealloc space are per cpu. The reason for having 4191 * per cpu locality group is to reduce the contention between block 4192 * request from multiple CPUs. 4193 */ 4194 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); 4195 4196 /* we're going to use group allocation */ 4197 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4198 4199 /* serialize all allocations in the group */ 4200 mutex_lock(&ac->ac_lg->lg_mutex); 4201 } 4202 4203 static noinline_for_stack int 4204 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 4205 struct ext4_allocation_request *ar) 4206 { 4207 struct super_block *sb = ar->inode->i_sb; 4208 struct ext4_sb_info *sbi = EXT4_SB(sb); 4209 struct ext4_super_block *es = sbi->s_es; 4210 ext4_group_t group; 4211 unsigned int len; 4212 ext4_fsblk_t goal; 4213 ext4_grpblk_t block; 4214 4215 /* we can't allocate > group size */ 4216 len = ar->len; 4217 4218 /* just a dirty hack to filter too big requests */ 4219 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4220 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4221 4222 /* start searching from the goal */ 4223 goal = ar->goal; 4224 if (goal < le32_to_cpu(es->s_first_data_block) || 4225 goal >= ext4_blocks_count(es)) 4226 goal = le32_to_cpu(es->s_first_data_block); 4227 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4228 4229 /* set up allocation goals */ 4230 memset(ac, 0, sizeof(struct ext4_allocation_context)); 4231 ac->ac_b_ex.fe_logical = ar->logical; 4232 ac->ac_status = AC_STATUS_CONTINUE; 4233 ac->ac_sb = sb; 4234 ac->ac_inode = ar->inode; 4235 ac->ac_o_ex.fe_logical = ar->logical; 4236 ac->ac_o_ex.fe_group = group; 4237 ac->ac_o_ex.fe_start = block; 4238 ac->ac_o_ex.fe_len = len; 4239 ac->ac_g_ex.fe_logical = ar->logical; 4240 ac->ac_g_ex.fe_group = group; 4241 ac->ac_g_ex.fe_start = block; 4242 ac->ac_g_ex.fe_len = len; 4243 ac->ac_flags = ar->flags; 4244 4245 /* we have to define context: we'll we work with a file or 4246 * locality group. 
this is a policy, actually */ 4247 ext4_mb_group_or_file(ac); 4248 4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4250 "left: %u/%u, right %u/%u to %swritable\n", 4251 (unsigned) ar->len, (unsigned) ar->logical, 4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4253 (unsigned) ar->lleft, (unsigned) ar->pleft, 4254 (unsigned) ar->lright, (unsigned) ar->pright, 4255 atomic_read(&ar->inode->i_writecount) ? "" : "non-"); 4256 return 0; 4257 4258 } 4259 4260 static noinline_for_stack void 4261 ext4_mb_discard_lg_preallocations(struct super_block *sb, 4262 struct ext4_locality_group *lg, 4263 int order, int total_entries) 4264 { 4265 ext4_group_t group = 0; 4266 struct ext4_buddy e4b; 4267 struct list_head discard_list; 4268 struct ext4_prealloc_space *pa, *tmp; 4269 struct ext4_allocation_context *ac; 4270 4271 mb_debug("discard locality group preallocation\n"); 4272 4273 INIT_LIST_HEAD(&discard_list); 4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4275 if (ac) 4276 ac->ac_sb = sb; 4277 4278 spin_lock(&lg->lg_prealloc_lock); 4279 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4280 pa_inode_list) { 4281 spin_lock(&pa->pa_lock); 4282 if (atomic_read(&pa->pa_count)) { 4283 /* 4284 * This is the pa that we just used 4285 * for block allocation. So don't 4286 * free that 4287 */ 4288 spin_unlock(&pa->pa_lock); 4289 continue; 4290 } 4291 if (pa->pa_deleted) { 4292 spin_unlock(&pa->pa_lock); 4293 continue; 4294 } 4295 /* only lg prealloc space */ 4296 BUG_ON(pa->pa_type != MB_GROUP_PA); 4297 4298 /* seems this one can be freed ... */ 4299 pa->pa_deleted = 1; 4300 spin_unlock(&pa->pa_lock); 4301 4302 list_del_rcu(&pa->pa_inode_list); 4303 list_add(&pa->u.pa_tmp_list, &discard_list); 4304 4305 total_entries--; 4306 if (total_entries <= 5) { 4307 /* 4308 * we want to keep only 5 entries 4309 * allowing it to grow to 8. This 4310 * mak sure we don't call discard 4311 * soon for this list. 4312 */ 4313 break; 4314 } 4315 } 4316 spin_unlock(&lg->lg_prealloc_lock); 4317 4318 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4319 4320 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4321 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4322 ext4_error(sb, __func__, "Error in loading buddy " 4323 "information for %u", group); 4324 continue; 4325 } 4326 ext4_lock_group(sb, group); 4327 list_del(&pa->pa_group_list); 4328 ext4_mb_release_group_pa(&e4b, pa, ac); 4329 ext4_unlock_group(sb, group); 4330 4331 ext4_mb_release_desc(&e4b); 4332 list_del(&pa->u.pa_tmp_list); 4333 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4334 } 4335 if (ac) 4336 kmem_cache_free(ext4_ac_cachep, ac); 4337 } 4338 4339 /* 4340 * We have incremented pa_count. So it cannot be freed at this 4341 * point. Also we hold lg_mutex. So no parallel allocation is 4342 * possible from this lg. That means pa_free cannot be updated. 4343 * 4344 * A parallel ext4_mb_discard_group_preallocations is possible. 4345 * which can cause the lg_prealloc_list to be updated. 
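 *
 * In short (a summary of ext4_mb_add_n_trim() below): the bucket is
 * chosen as order = fls(pa_free) - 1, capped at PREALLOC_TB_SIZE - 1;
 * the pa is linked in front of the first entry that has more free
 * blocks than it does; and if the bucket is then found to hold more
 * than 8 entries, ext4_mb_discard_lg_preallocations() is called to
 * trim it back (it stops once roughly 5 entries remain).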
4346 */ 4347 4348 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 4349 { 4350 int order, added = 0, lg_prealloc_count = 1; 4351 struct super_block *sb = ac->ac_sb; 4352 struct ext4_locality_group *lg = ac->ac_lg; 4353 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 4354 4355 order = fls(pa->pa_free) - 1; 4356 if (order > PREALLOC_TB_SIZE - 1) 4357 /* The max size of hash table is PREALLOC_TB_SIZE */ 4358 order = PREALLOC_TB_SIZE - 1; 4359 /* Add the prealloc space to lg */ 4360 rcu_read_lock(); 4361 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 4362 pa_inode_list) { 4363 spin_lock(&tmp_pa->pa_lock); 4364 if (tmp_pa->pa_deleted) { 4365 spin_unlock(&tmp_pa->pa_lock); 4366 continue; 4367 } 4368 if (!added && pa->pa_free < tmp_pa->pa_free) { 4369 /* Add to the tail of the previous entry */ 4370 list_add_tail_rcu(&pa->pa_inode_list, 4371 &tmp_pa->pa_inode_list); 4372 added = 1; 4373 /* 4374 * we want to count the total 4375 * number of entries in the list 4376 */ 4377 } 4378 spin_unlock(&tmp_pa->pa_lock); 4379 lg_prealloc_count++; 4380 } 4381 if (!added) 4382 list_add_tail_rcu(&pa->pa_inode_list, 4383 &lg->lg_prealloc_list[order]); 4384 rcu_read_unlock(); 4385 4386 /* Now trim the list to be not more than 8 elements */ 4387 if (lg_prealloc_count > 8) { 4388 ext4_mb_discard_lg_preallocations(sb, lg, 4389 order, lg_prealloc_count); 4390 return; 4391 } 4392 return ; 4393 } 4394 4395 /* 4396 * release all resource we used in allocation 4397 */ 4398 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4399 { 4400 struct ext4_prealloc_space *pa = ac->ac_pa; 4401 if (pa) { 4402 if (pa->pa_type == MB_GROUP_PA) { 4403 /* see comment in ext4_mb_use_group_pa() */ 4404 spin_lock(&pa->pa_lock); 4405 pa->pa_pstart += ac->ac_b_ex.fe_len; 4406 pa->pa_lstart += ac->ac_b_ex.fe_len; 4407 pa->pa_free -= ac->ac_b_ex.fe_len; 4408 pa->pa_len -= ac->ac_b_ex.fe_len; 4409 spin_unlock(&pa->pa_lock); 4410 } 4411 } 4412 if (ac->alloc_semp) 4413 up_read(ac->alloc_semp); 4414 if (pa) { 4415 /* 4416 * We want to add the pa to the right bucket. 4417 * Remove it from the list and while adding 4418 * make sure the list to which we are adding 4419 * doesn't grow big. 
We need to release 4420 * alloc_semp before calling ext4_mb_add_n_trim() 4421 */ 4422 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4423 spin_lock(pa->pa_obj_lock); 4424 list_del_rcu(&pa->pa_inode_list); 4425 spin_unlock(pa->pa_obj_lock); 4426 ext4_mb_add_n_trim(ac); 4427 } 4428 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4429 } 4430 if (ac->ac_bitmap_page) 4431 page_cache_release(ac->ac_bitmap_page); 4432 if (ac->ac_buddy_page) 4433 page_cache_release(ac->ac_buddy_page); 4434 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4435 mutex_unlock(&ac->ac_lg->lg_mutex); 4436 ext4_mb_collect_stats(ac); 4437 return 0; 4438 } 4439 4440 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4441 { 4442 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4443 int ret; 4444 int freed = 0; 4445 4446 trace_ext4_mb_discard_preallocations(sb, needed); 4447 for (i = 0; i < ngroups && needed > 0; i++) { 4448 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4449 freed += ret; 4450 needed -= ret; 4451 } 4452 4453 return freed; 4454 } 4455 4456 /* 4457 * Main entry point into mballoc to allocate blocks 4458 * it tries to use preallocation first, then falls back 4459 * to usual allocation 4460 */ 4461 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4462 struct ext4_allocation_request *ar, int *errp) 4463 { 4464 int freed; 4465 struct ext4_allocation_context *ac = NULL; 4466 struct ext4_sb_info *sbi; 4467 struct super_block *sb; 4468 ext4_fsblk_t block = 0; 4469 unsigned int inquota = 0; 4470 unsigned int reserv_blks = 0; 4471 4472 sb = ar->inode->i_sb; 4473 sbi = EXT4_SB(sb); 4474 4475 trace_ext4_request_blocks(ar); 4476 4477 /* 4478 * For delayed allocation, we could skip the ENOSPC and 4479 * EDQUOT check, as blocks and quotas have been already 4480 * reserved when data being copied into pagecache. 4481 */ 4482 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4483 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4484 else { 4485 /* Without delayed allocation we need to verify 4486 * there is enough free blocks to do block allocation 4487 * and verify allocation doesn't exceed the quota limits. 
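 *
 * The fallback strategy below: keep halving the requested length until
 * ext4_claim_free_blocks() accepts it (-ENOSPC if it never does), then
 * charge quota for that length, shrinking the request one block at a
 * time with EXT4_MB_HINT_NOPREALLOC set whenever the quota charge
 * fails (-EDQUOT once the length reaches zero).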
4488 */ 4489 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4490 /* let others to free the space */ 4491 yield(); 4492 ar->len = ar->len >> 1; 4493 } 4494 if (!ar->len) { 4495 *errp = -ENOSPC; 4496 return 0; 4497 } 4498 reserv_blks = ar->len; 4499 while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { 4500 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4501 ar->len--; 4502 } 4503 inquota = ar->len; 4504 if (ar->len == 0) { 4505 *errp = -EDQUOT; 4506 goto out3; 4507 } 4508 } 4509 4510 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4511 if (!ac) { 4512 ar->len = 0; 4513 *errp = -ENOMEM; 4514 goto out1; 4515 } 4516 4517 *errp = ext4_mb_initialize_context(ac, ar); 4518 if (*errp) { 4519 ar->len = 0; 4520 goto out2; 4521 } 4522 4523 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4524 if (!ext4_mb_use_preallocated(ac)) { 4525 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4526 ext4_mb_normalize_request(ac, ar); 4527 repeat: 4528 /* allocate space in core */ 4529 ext4_mb_regular_allocator(ac); 4530 4531 /* as we've just preallocated more space than 4532 * user requested orinally, we store allocated 4533 * space in a special descriptor */ 4534 if (ac->ac_status == AC_STATUS_FOUND && 4535 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4536 ext4_mb_new_preallocation(ac); 4537 } 4538 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4539 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4540 if (*errp == -EAGAIN) { 4541 /* 4542 * drop the reference that we took 4543 * in ext4_mb_use_best_found 4544 */ 4545 ext4_mb_release_context(ac); 4546 ac->ac_b_ex.fe_group = 0; 4547 ac->ac_b_ex.fe_start = 0; 4548 ac->ac_b_ex.fe_len = 0; 4549 ac->ac_status = AC_STATUS_CONTINUE; 4550 goto repeat; 4551 } else if (*errp) { 4552 ac->ac_b_ex.fe_len = 0; 4553 ar->len = 0; 4554 ext4_mb_show_ac(ac); 4555 } else { 4556 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4557 ar->len = ac->ac_b_ex.fe_len; 4558 } 4559 } else { 4560 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 4561 if (freed) 4562 goto repeat; 4563 *errp = -ENOSPC; 4564 ac->ac_b_ex.fe_len = 0; 4565 ar->len = 0; 4566 ext4_mb_show_ac(ac); 4567 } 4568 4569 ext4_mb_release_context(ac); 4570 4571 out2: 4572 kmem_cache_free(ext4_ac_cachep, ac); 4573 out1: 4574 if (inquota && ar->len < inquota) 4575 vfs_dq_free_block(ar->inode, inquota - ar->len); 4576 out3: 4577 if (!ar->len) { 4578 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4579 /* release all the reserved blocks if non delalloc */ 4580 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4581 reserv_blks); 4582 } 4583 4584 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 4585 4586 return block; 4587 } 4588 4589 /* 4590 * We can merge two free data extents only if the physical blocks 4591 * are contiguous, AND the extents were freed by the same transaction, 4592 * AND the blocks are associated with the same group. 
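 *
 * For example (hypothetical numbers): {t_tid=12, group=3, start_blk=200,
 * count=16} and {t_tid=12, group=3, start_blk=216, count=8} collapse
 * into a single entry covering blocks 200..223, whereas the same pair
 * freed under different transactions must stay separate so that each
 * piece becomes reusable only after its own transaction commits.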
4593 */ 4594 static int can_merge(struct ext4_free_data *entry1, 4595 struct ext4_free_data *entry2) 4596 { 4597 if ((entry1->t_tid == entry2->t_tid) && 4598 (entry1->group == entry2->group) && 4599 ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4600 return 1; 4601 return 0; 4602 } 4603 4604 static noinline_for_stack int 4605 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4606 struct ext4_free_data *new_entry) 4607 { 4608 ext4_grpblk_t block; 4609 struct ext4_free_data *entry; 4610 struct ext4_group_info *db = e4b->bd_info; 4611 struct super_block *sb = e4b->bd_sb; 4612 struct ext4_sb_info *sbi = EXT4_SB(sb); 4613 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4614 struct rb_node *parent = NULL, *new_node; 4615 4616 BUG_ON(!ext4_handle_valid(handle)); 4617 BUG_ON(e4b->bd_bitmap_page == NULL); 4618 BUG_ON(e4b->bd_buddy_page == NULL); 4619 4620 new_node = &new_entry->node; 4621 block = new_entry->start_blk; 4622 4623 if (!*n) { 4624 /* first free block exent. We need to 4625 protect buddy cache from being freed, 4626 * otherwise we'll refresh it from 4627 * on-disk bitmap and lose not-yet-available 4628 * blocks */ 4629 page_cache_get(e4b->bd_buddy_page); 4630 page_cache_get(e4b->bd_bitmap_page); 4631 } 4632 while (*n) { 4633 parent = *n; 4634 entry = rb_entry(parent, struct ext4_free_data, node); 4635 if (block < entry->start_blk) 4636 n = &(*n)->rb_left; 4637 else if (block >= (entry->start_blk + entry->count)) 4638 n = &(*n)->rb_right; 4639 else { 4640 ext4_grp_locked_error(sb, e4b->bd_group, __func__, 4641 "Double free of blocks %d (%d %d)", 4642 block, entry->start_blk, entry->count); 4643 return 0; 4644 } 4645 } 4646 4647 rb_link_node(new_node, parent, n); 4648 rb_insert_color(new_node, &db->bb_free_root); 4649 4650 /* Now try to see the extent can be merged to left and right */ 4651 node = rb_prev(new_node); 4652 if (node) { 4653 entry = rb_entry(node, struct ext4_free_data, node); 4654 if (can_merge(entry, new_entry)) { 4655 new_entry->start_blk = entry->start_blk; 4656 new_entry->count += entry->count; 4657 rb_erase(node, &(db->bb_free_root)); 4658 spin_lock(&sbi->s_md_lock); 4659 list_del(&entry->list); 4660 spin_unlock(&sbi->s_md_lock); 4661 kmem_cache_free(ext4_free_ext_cachep, entry); 4662 } 4663 } 4664 4665 node = rb_next(new_node); 4666 if (node) { 4667 entry = rb_entry(node, struct ext4_free_data, node); 4668 if (can_merge(new_entry, entry)) { 4669 new_entry->count += entry->count; 4670 rb_erase(node, &(db->bb_free_root)); 4671 spin_lock(&sbi->s_md_lock); 4672 list_del(&entry->list); 4673 spin_unlock(&sbi->s_md_lock); 4674 kmem_cache_free(ext4_free_ext_cachep, entry); 4675 } 4676 } 4677 /* Add the extent to transaction's private list */ 4678 spin_lock(&sbi->s_md_lock); 4679 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4680 spin_unlock(&sbi->s_md_lock); 4681 return 0; 4682 } 4683 4684 /* 4685 * Main entry point into mballoc to free blocks 4686 */ 4687 void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4688 ext4_fsblk_t block, unsigned long count, 4689 int metadata, unsigned long *freed) 4690 { 4691 struct buffer_head *bitmap_bh = NULL; 4692 struct super_block *sb = inode->i_sb; 4693 struct ext4_allocation_context *ac = NULL; 4694 struct ext4_group_desc *gdp; 4695 struct ext4_super_block *es; 4696 unsigned int overflow; 4697 ext4_grpblk_t bit; 4698 struct buffer_head *gd_bh; 4699 ext4_group_t block_group; 4700 struct ext4_sb_info *sbi; 4701 struct ext4_buddy e4b; 4702 int err = 0; 4703 int ret; 4704 4705 
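	/*
	 * Outline of the free path below: validate the range against the
	 * superblock and the group's system zone, split requests that cross
	 * a group boundary, get journal write access to the block bitmap and
	 * group descriptor, then either queue metadata blocks on the current
	 * transaction's private list (so they are not reused before commit)
	 * or hand data blocks straight back to the buddy; finally the group
	 * and filesystem free-block counters and the caller's *freed are
	 * updated.
	 */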
*freed = 0; 4706 4707 sbi = EXT4_SB(sb); 4708 es = EXT4_SB(sb)->s_es; 4709 if (block < le32_to_cpu(es->s_first_data_block) || 4710 block + count < block || 4711 block + count > ext4_blocks_count(es)) { 4712 ext4_error(sb, __func__, 4713 "Freeing blocks not in datazone - " 4714 "block = %llu, count = %lu", block, count); 4715 goto error_return; 4716 } 4717 4718 ext4_debug("freeing block %llu\n", block); 4719 trace_ext4_free_blocks(inode, block, count, metadata); 4720 4721 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4722 if (ac) { 4723 ac->ac_op = EXT4_MB_HISTORY_FREE; 4724 ac->ac_inode = inode; 4725 ac->ac_sb = sb; 4726 } 4727 4728 do_more: 4729 overflow = 0; 4730 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4731 4732 /* 4733 * Check to see if we are freeing blocks across a group 4734 * boundary. 4735 */ 4736 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4737 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4738 count -= overflow; 4739 } 4740 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4741 if (!bitmap_bh) { 4742 err = -EIO; 4743 goto error_return; 4744 } 4745 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4746 if (!gdp) { 4747 err = -EIO; 4748 goto error_return; 4749 } 4750 4751 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4752 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4753 in_range(block, ext4_inode_table(sb, gdp), 4754 EXT4_SB(sb)->s_itb_per_group) || 4755 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4756 EXT4_SB(sb)->s_itb_per_group)) { 4757 4758 ext4_error(sb, __func__, 4759 "Freeing blocks in system zone - " 4760 "Block = %llu, count = %lu", block, count); 4761 /* err = 0. ext4_std_error should be a no op */ 4762 goto error_return; 4763 } 4764 4765 BUFFER_TRACE(bitmap_bh, "getting write access"); 4766 err = ext4_journal_get_write_access(handle, bitmap_bh); 4767 if (err) 4768 goto error_return; 4769 4770 /* 4771 * We are about to modify some metadata. Call the journal APIs 4772 * to unshare ->b_data if a currently-committing transaction is 4773 * using it 4774 */ 4775 BUFFER_TRACE(gd_bh, "get_write_access"); 4776 err = ext4_journal_get_write_access(handle, gd_bh); 4777 if (err) 4778 goto error_return; 4779 #ifdef AGGRESSIVE_CHECK 4780 { 4781 int i; 4782 for (i = 0; i < count; i++) 4783 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4784 } 4785 #endif 4786 if (ac) { 4787 ac->ac_b_ex.fe_group = block_group; 4788 ac->ac_b_ex.fe_start = bit; 4789 ac->ac_b_ex.fe_len = count; 4790 ext4_mb_store_history(ac); 4791 } 4792 4793 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4794 if (err) 4795 goto error_return; 4796 if (metadata && ext4_handle_valid(handle)) { 4797 struct ext4_free_data *new_entry; 4798 /* 4799 * blocks being freed are metadata. these blocks shouldn't 4800 * be used until this transaction is committed 4801 */ 4802 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4803 new_entry->start_blk = bit; 4804 new_entry->group = block_group; 4805 new_entry->count = count; 4806 new_entry->t_tid = handle->h_transaction->t_tid; 4807 4808 ext4_lock_group(sb, block_group); 4809 mb_clear_bits(bitmap_bh->b_data, bit, count); 4810 ext4_mb_free_metadata(handle, &e4b, new_entry); 4811 } else { 4812 /* need to update group_info->bb_free and bitmap 4813 * with group lock held. 
ext4_mb_generate_buddy() looks at 4814 * them with the group lock held 4815 */ 4816 ext4_lock_group(sb, block_group); 4817 mb_clear_bits(bitmap_bh->b_data, bit, count); 4818 mb_free_blocks(inode, &e4b, bit, count); 4819 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4820 } 4821 4822 ret = ext4_free_blks_count(sb, gdp) + count; 4823 ext4_free_blks_set(sb, gdp, ret); 4824 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4825 ext4_unlock_group(sb, block_group); 4826 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4827 4828 if (sbi->s_log_groups_per_flex) { 4829 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4830 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4831 } 4832 4833 ext4_mb_release_desc(&e4b); 4834 4835 *freed += count; 4836 4837 /* We dirtied the bitmap block */ 4838 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4839 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4840 4841 /* And the group descriptor block */ 4842 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4843 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 4844 if (!err) 4845 err = ret; 4846 4847 if (overflow && !err) { 4848 block += count; 4849 count = overflow; 4850 put_bh(bitmap_bh); 4851 goto do_more; 4852 } 4853 sb->s_dirt = 1; 4854 error_return: 4855 brelse(bitmap_bh); 4856 ext4_std_error(sb, err); 4857 if (ac) 4858 kmem_cache_free(ext4_ac_cachep, ac); 4859 return; 4860 } 4861
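/*
 * Illustrative only: a minimal sketch of how the two entry points above
 * (ext4_mb_new_blocks() and ext4_mb_free_blocks()) are meant to be driven
 * by a caller.  It is kept under "#if 0" because it is not part of the
 * allocator; the helper's name, the goal block and the request length are
 * assumptions made for this sketch, not values taken from real callers.
 */
#if 0
static ext4_fsblk_t mballoc_usage_sketch(handle_t *handle, struct inode *inode,
					 ext4_lblk_t lblock)
{
	struct ext4_allocation_request ar;
	ext4_fsblk_t pblk;
	unsigned long freed = 0;
	int err = 0;

	memset(&ar, 0, sizeof(ar));
	ar.inode = inode;
	ar.logical = lblock;		/* file block being mapped */
	/* a crude goal: start of the data area; real callers pick a goal
	 * near already-allocated neighbours of @lblock */
	ar.goal = le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
	ar.len = 8;			/* ask for 8 blocks near the goal */
	ar.flags = EXT4_MB_HINT_DATA;

	pblk = ext4_mb_new_blocks(handle, &ar, &err);
	if (!pblk)
		return 0;	/* err is -ENOSPC, -EDQUOT, ... */

	/* on success ar.len may be smaller than requested; the caller may
	 * use blocks pblk .. pblk + ar.len - 1 and later return them.
	 * The '0' says these are data blocks, not metadata. */
	ext4_mb_free_blocks(handle, inode, pblk, ar.len, 0, &freed);

	return pblk;
}
#endif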