1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * suballoc.c 5 * 6 * metadata alloc and free 7 * Inspired by ext3 block groups. 8 * 9 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 10 * 11 * This program is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU General Public 13 * License as published by the Free Software Foundation; either 14 * version 2 of the License, or (at your option) any later version. 15 * 16 * This program is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public 22 * License along with this program; if not, write to the 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 24 * Boston, MA 021110-1307, USA. 25 */ 26 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33 #include <cluster/masklog.h> 34 35 #include "ocfs2.h" 36 37 #include "alloc.h" 38 #include "dlmglue.h" 39 #include "inode.h" 40 #include "journal.h" 41 #include "localalloc.h" 42 #include "suballoc.h" 43 #include "super.h" 44 #include "sysfile.h" 45 #include "uptodate.h" 46 47 #include "buffer_head_io.h" 48 49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 52 static int ocfs2_block_group_fill(handle_t *handle, 53 struct inode *alloc_inode, 54 struct buffer_head *bg_bh, 55 u64 group_blkno, 56 u16 my_chain, 57 struct ocfs2_chain_list *cl); 58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 59 struct inode *alloc_inode, 60 struct buffer_head *bh); 61 62 static int ocfs2_cluster_group_search(struct inode *inode, 63 struct buffer_head *group_bh, 64 u32 bits_wanted, u32 min_bits, 65 u16 *bit_off, u16 *bits_found); 66 static int ocfs2_block_group_search(struct inode *inode, 67 struct buffer_head *group_bh, 68 u32 bits_wanted, u32 min_bits, 69 u16 *bit_off, u16 *bits_found); 70 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 71 struct ocfs2_alloc_context *ac, 72 handle_t *handle, 73 u32 bits_wanted, 74 u32 min_bits, 75 u16 *bit_off, 76 unsigned int *num_bits, 77 u64 *bg_blkno); 78 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 79 int nr); 80 static inline int ocfs2_block_group_set_bits(handle_t *handle, 81 struct inode *alloc_inode, 82 struct ocfs2_group_desc *bg, 83 struct buffer_head *group_bh, 84 unsigned int bit_off, 85 unsigned int num_bits); 86 static inline int ocfs2_block_group_clear_bits(handle_t *handle, 87 struct inode *alloc_inode, 88 struct ocfs2_group_desc *bg, 89 struct buffer_head *group_bh, 90 unsigned int bit_off, 91 unsigned int num_bits); 92 93 static int ocfs2_relink_block_group(handle_t *handle, 94 struct inode *alloc_inode, 95 struct buffer_head *fe_bh, 96 struct buffer_head *bg_bh, 97 struct buffer_head *prev_bg_bh, 98 u16 chain); 99 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 100 u32 wanted); 101 static int ocfs2_free_suballoc_bits(handle_t *handle, 102 struct inode *alloc_inode, 103 struct buffer_head *alloc_bh, 104 unsigned int start_bit, 105 u64 bg_blkno, 106 unsigned int count); 107 static inline u64 ocfs2_which_suballoc_group(u64 block, 108 unsigned int bit); 109 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 110 u64 bg_blkno, 111 u16 bg_bit_off); 112 static inline u64 ocfs2_which_cluster_group(struct inode *inode, 113 u32 cluster); 114 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 115 u64 data_blkno, 116 u64 *bg_blkno, 117 u16 *bg_bit_off); 118 119 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 120 { 121 struct inode *inode = ac->ac_inode; 122 123 if (inode) { 124 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 125 ocfs2_meta_unlock(inode, 1); 126 127 mutex_unlock(&inode->i_mutex); 128 129 iput(inode); 130 } 131 if (ac->ac_bh) 132 brelse(ac->ac_bh); 133 kfree(ac); 134 } 135 136 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) 137 { 138 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); 139 } 140 141 /* somewhat more expensive than our other checks, so use sparingly. */ 142 static int ocfs2_check_group_descriptor(struct super_block *sb, 143 struct ocfs2_dinode *di, 144 struct ocfs2_group_desc *gd) 145 { 146 unsigned int max_bits; 147 148 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 149 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); 150 return -EIO; 151 } 152 153 if (di->i_blkno != gd->bg_parent_dinode) { 154 ocfs2_error(sb, "Group descriptor # %llu has bad parent " 155 "pointer (%llu, expected %llu)", 156 (unsigned long long)le64_to_cpu(gd->bg_blkno), 157 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 158 (unsigned long long)le64_to_cpu(di->i_blkno)); 159 return -EIO; 160 } 161 162 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 163 if (le16_to_cpu(gd->bg_bits) > max_bits) { 164 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", 165 (unsigned long long)le64_to_cpu(gd->bg_blkno), 166 le16_to_cpu(gd->bg_bits)); 167 return -EIO; 168 } 169 170 if (le16_to_cpu(gd->bg_chain) >= 171 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 172 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", 173 (unsigned long long)le64_to_cpu(gd->bg_blkno), 174 le16_to_cpu(gd->bg_chain)); 175 return -EIO; 176 } 177 178 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 179 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " 180 "claims that %u are free", 181 (unsigned long long)le64_to_cpu(gd->bg_blkno), 182 le16_to_cpu(gd->bg_bits), 183 le16_to_cpu(gd->bg_free_bits_count)); 184 return -EIO; 185 } 186 187 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 188 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " 189 "max bitmap bits of %u", 190 (unsigned long long)le64_to_cpu(gd->bg_blkno), 191 le16_to_cpu(gd->bg_bits), 192 8 * le16_to_cpu(gd->bg_size)); 193 return -EIO; 194 } 195 196 return 0; 197 } 198 199 static int ocfs2_block_group_fill(handle_t *handle, 200 struct inode *alloc_inode, 201 struct buffer_head *bg_bh, 202 u64 group_blkno, 203 u16 my_chain, 204 struct ocfs2_chain_list *cl) 205 { 206 int status = 0; 207 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 208 struct super_block * sb = alloc_inode->i_sb; 209 210 mlog_entry_void(); 211 212 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 213 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " 214 "b_blocknr (%llu)", 215 (unsigned long long)group_blkno, 216 (unsigned long long) bg_bh->b_blocknr); 217 status = -EIO; 218 goto bail; 219 } 220 221 status = ocfs2_journal_access(handle, 222 alloc_inode, 223 bg_bh, 224 OCFS2_JOURNAL_ACCESS_CREATE); 225 if (status < 0) { 226 mlog_errno(status); 227 goto bail; 228 } 229 230 memset(bg, 0, sb->s_blocksize); 231 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 232 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 233 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 234 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 235 bg->bg_chain = cpu_to_le16(my_chain); 236 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 237 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 238 bg->bg_blkno = cpu_to_le64(group_blkno); 239 /* set the 1st bit in the bitmap to account for the descriptor block */ 240 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 241 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 242 243 status = ocfs2_journal_dirty(handle, bg_bh); 244 if (status < 0) 245 mlog_errno(status); 246 247 /* There is no need to zero out or otherwise initialize the 248 * other blocks in a group - All valid FS metadata in a block 249 * group stores the superblock fs_generation value at 250 * allocation time. */ 251 252 bail: 253 mlog_exit(status); 254 return status; 255 } 256 257 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) 258 { 259 u16 curr, best; 260 261 best = curr = 0; 262 while (curr < le16_to_cpu(cl->cl_count)) { 263 if (le32_to_cpu(cl->cl_recs[best].c_total) > 264 le32_to_cpu(cl->cl_recs[curr].c_total)) 265 best = curr; 266 curr++; 267 } 268 return best; 269 } 270 271 /* 272 * We expect the block group allocator to already be locked. 273 */ 274 static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 275 struct inode *alloc_inode, 276 struct buffer_head *bh) 277 { 278 int status, credits; 279 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 280 struct ocfs2_chain_list *cl; 281 struct ocfs2_alloc_context *ac = NULL; 282 handle_t *handle = NULL; 283 u32 bit_off, num_bits; 284 u16 alloc_rec; 285 u64 bg_blkno; 286 struct buffer_head *bg_bh = NULL; 287 struct ocfs2_group_desc *bg; 288 289 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); 290 291 mlog_entry_void(); 292 293 cl = &fe->id2.i_chain; 294 status = ocfs2_reserve_clusters(osb, 295 le16_to_cpu(cl->cl_cpg), 296 &ac); 297 if (status < 0) { 298 if (status != -ENOSPC) 299 mlog_errno(status); 300 goto bail; 301 } 302 303 credits = ocfs2_calc_group_alloc_credits(osb->sb, 304 le16_to_cpu(cl->cl_cpg)); 305 handle = ocfs2_start_trans(osb, credits); 306 if (IS_ERR(handle)) { 307 status = PTR_ERR(handle); 308 handle = NULL; 309 mlog_errno(status); 310 goto bail; 311 } 312 313 status = ocfs2_claim_clusters(osb, 314 handle, 315 ac, 316 le16_to_cpu(cl->cl_cpg), 317 &bit_off, 318 &num_bits); 319 if (status < 0) { 320 if (status != -ENOSPC) 321 mlog_errno(status); 322 goto bail; 323 } 324 325 alloc_rec = ocfs2_find_smallest_chain(cl); 326 327 /* setup the group */ 328 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 329 mlog(0, "new descriptor, record %u, at block %llu\n", 330 alloc_rec, (unsigned long long)bg_blkno); 331 332 bg_bh = sb_getblk(osb->sb, bg_blkno); 333 if (!bg_bh) { 334 status = -EIO; 335 mlog_errno(status); 336 goto bail; 337 } 338 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); 339 340 status = ocfs2_block_group_fill(handle, 341 alloc_inode, 342 bg_bh, 343 bg_blkno, 344 alloc_rec, 345 cl); 346 if (status < 0) { 347 mlog_errno(status); 348 goto bail; 349 } 350 351 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 352 353 status = ocfs2_journal_access(handle, alloc_inode, 354 bh, OCFS2_JOURNAL_ACCESS_WRITE); 355 if (status < 0) { 356 mlog_errno(status); 357 goto bail; 358 } 359 360 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 361 le16_to_cpu(bg->bg_free_bits_count)); 362 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 363 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 364 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 365 le16_add_cpu(&cl->cl_next_free_rec, 1); 366 367 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - 368 le16_to_cpu(bg->bg_free_bits_count)); 369 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 370 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 371 372 status = ocfs2_journal_dirty(handle, bh); 373 if (status < 0) { 374 mlog_errno(status); 375 goto bail; 376 } 377 378 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 379 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 380 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 381 le32_to_cpu(fe->i_clusters))); 382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 384 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 385 386 status = 0; 387 bail: 388 if (handle) 389 ocfs2_commit_trans(osb, handle); 390 391 if (ac) 392 ocfs2_free_alloc_context(ac); 393 394 if (bg_bh) 395 brelse(bg_bh); 396 397 mlog_exit(status); 398 return status; 399 } 400 401 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, 402 struct ocfs2_alloc_context *ac, 403 int type, 404 u32 slot) 405 { 406 int status; 407 u32 bits_wanted = ac->ac_bits_wanted; 408 struct inode *alloc_inode; 409 struct buffer_head *bh = NULL; 410 struct ocfs2_dinode *fe; 411 u32 free_bits; 412 413 mlog_entry_void(); 414 415 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); 416 if (!alloc_inode) { 417 mlog_errno(-EINVAL); 418 return -EINVAL; 419 } 420 421 mutex_lock(&alloc_inode->i_mutex); 422 423 status = ocfs2_meta_lock(alloc_inode, &bh, 1); 424 if (status < 0) { 425 mutex_unlock(&alloc_inode->i_mutex); 426 iput(alloc_inode); 427 428 mlog_errno(status); 429 return status; 430 } 431 432 ac->ac_inode = alloc_inode; 433 434 fe = (struct ocfs2_dinode *) bh->b_data; 435 if (!OCFS2_IS_VALID_DINODE(fe)) { 436 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 437 status = -EIO; 438 goto bail; 439 } 440 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 441 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", 442 (unsigned long long)le64_to_cpu(fe->i_blkno)); 443 status = -EIO; 444 goto bail; 445 } 446 447 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - 448 le32_to_cpu(fe->id1.bitmap1.i_used); 449 450 if (bits_wanted > free_bits) { 451 /* cluster bitmap never grows */ 452 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 453 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", 454 bits_wanted, free_bits); 455 status = -ENOSPC; 456 goto bail; 457 } 458 459 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 460 if (status < 0) { 461 if (status != -ENOSPC) 462 mlog_errno(status); 463 goto bail; 464 } 465 atomic_inc(&osb->alloc_stats.bg_extends); 466 467 /* You should never ask for this much metadata */ 468 BUG_ON(bits_wanted > 469 (le32_to_cpu(fe->id1.bitmap1.i_total) 470 - le32_to_cpu(fe->id1.bitmap1.i_used))); 471 } 472 473 get_bh(bh); 474 ac->ac_bh = bh; 475 bail: 476 if (bh) 477 brelse(bh); 478 479 mlog_exit(status); 480 return status; 481 } 482 483 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 484 struct ocfs2_dinode *fe, 485 struct ocfs2_alloc_context **ac) 486 { 487 int status; 488 u32 slot; 489 490 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 491 if (!(*ac)) { 492 status = -ENOMEM; 493 mlog_errno(status); 494 goto bail; 495 } 496 497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 498 (*ac)->ac_which = OCFS2_AC_USE_META; 499 500 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS 501 slot = 0; 502 #else 503 slot = osb->slot_num; 504 #endif 505 506 (*ac)->ac_group_search = ocfs2_block_group_search; 507 508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 509 EXTENT_ALLOC_SYSTEM_INODE, slot); 510 if (status < 0) { 511 if (status != -ENOSPC) 512 mlog_errno(status); 513 goto bail; 514 } 515 516 status = 0; 517 bail: 518 if ((status < 0) && *ac) { 519 ocfs2_free_alloc_context(*ac); 520 *ac = NULL; 521 } 522 523 mlog_exit(status); 524 return status; 525 } 526 527 int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 528 struct ocfs2_alloc_context **ac) 529 { 530 int status; 531 532 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 533 if (!(*ac)) { 534 status = -ENOMEM; 535 mlog_errno(status); 536 goto bail; 537 } 538 539 (*ac)->ac_bits_wanted = 1; 540 (*ac)->ac_which = OCFS2_AC_USE_INODE; 541 542 (*ac)->ac_group_search = ocfs2_block_group_search; 543 544 status = ocfs2_reserve_suballoc_bits(osb, *ac, 545 INODE_ALLOC_SYSTEM_INODE, 546 osb->slot_num); 547 if (status < 0) { 548 if (status != -ENOSPC) 549 mlog_errno(status); 550 goto bail; 551 } 552 553 status = 0; 554 bail: 555 if ((status < 0) && *ac) { 556 ocfs2_free_alloc_context(*ac); 557 *ac = NULL; 558 } 559 560 mlog_exit(status); 561 return status; 562 } 563 564 /* local alloc code has to do the same thing, so rather than do this 565 * twice.. */ 566 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 567 struct ocfs2_alloc_context *ac) 568 { 569 int status; 570 571 ac->ac_which = OCFS2_AC_USE_MAIN; 572 ac->ac_group_search = ocfs2_cluster_group_search; 573 574 status = ocfs2_reserve_suballoc_bits(osb, ac, 575 GLOBAL_BITMAP_SYSTEM_INODE, 576 OCFS2_INVALID_SLOT); 577 if (status < 0 && status != -ENOSPC) { 578 mlog_errno(status); 579 goto bail; 580 } 581 582 bail: 583 return status; 584 } 585 586 /* Callers don't need to care which bitmap (local alloc or main) to 587 * use so we figure it out for them, but unfortunately this clutters 588 * things a bit. */ 589 int ocfs2_reserve_clusters(struct ocfs2_super *osb, 590 u32 bits_wanted, 591 struct ocfs2_alloc_context **ac) 592 { 593 int status; 594 595 mlog_entry_void(); 596 597 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 598 if (!(*ac)) { 599 status = -ENOMEM; 600 mlog_errno(status); 601 goto bail; 602 } 603 604 (*ac)->ac_bits_wanted = bits_wanted; 605 606 status = -ENOSPC; 607 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 608 status = ocfs2_reserve_local_alloc_bits(osb, 609 bits_wanted, 610 *ac); 611 if ((status < 0) && (status != -ENOSPC)) { 612 mlog_errno(status); 613 goto bail; 614 } else if (status == -ENOSPC) { 615 /* reserve_local_bits will return enospc with 616 * the local alloc inode still locked, so we 617 * can change this safely here. */ 618 mlog(0, "Disabling local alloc\n"); 619 /* We set to OCFS2_LA_DISABLED so that umount 620 * can clean up what's left of the local 621 * allocation */ 622 osb->local_alloc_state = OCFS2_LA_DISABLED; 623 } 624 } 625 626 if (status == -ENOSPC) { 627 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 628 if (status < 0) { 629 if (status != -ENOSPC) 630 mlog_errno(status); 631 goto bail; 632 } 633 } 634 635 status = 0; 636 bail: 637 if ((status < 0) && *ac) { 638 ocfs2_free_alloc_context(*ac); 639 *ac = NULL; 640 } 641 642 mlog_exit(status); 643 return status; 644 } 645 646 /* 647 * More or less lifted from ext3. I'll leave their description below: 648 * 649 * "For ext3 allocations, we must not reuse any blocks which are 650 * allocated in the bitmap buffer's "last committed data" copy. This 651 * prevents deletes from freeing up the page for reuse until we have 652 * committed the delete transaction. 653 * 654 * If we didn't do this, then deleting something and reallocating it as 655 * data would allow the old block to be overwritten before the 656 * transaction committed (because we force data to disk before commit). 657 * This would lead to corruption if we crashed between overwriting the 658 * data and committing the delete. 659 * 660 * @@@ We may want to make this allocation behaviour conditional on 661 * data-writes at some point, and disable it for metadata allocations or 662 * sync-data inodes." 663 * 664 * Note: OCFS2 already does this differently for metadata vs data 665 * allocations, as those bitmaps are seperate and undo access is never 666 * called on a metadata group descriptor. 667 */ 668 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 669 int nr) 670 { 671 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 672 673 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) 674 return 0; 675 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) 676 return 1; 677 678 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; 679 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); 680 } 681 682 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 683 struct buffer_head *bg_bh, 684 unsigned int bits_wanted, 685 unsigned int total_bits, 686 u16 *bit_off, 687 u16 *bits_found) 688 { 689 void *bitmap; 690 u16 best_offset, best_size; 691 int offset, start, found, status = 0; 692 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 693 694 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 695 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); 696 return -EIO; 697 } 698 699 found = start = best_offset = best_size = 0; 700 bitmap = bg->bg_bitmap; 701 702 while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { 703 if (offset == total_bits) 704 break; 705 706 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { 707 /* We found a zero, but we can't use it as it 708 * hasn't been put to disk yet! */ 709 found = 0; 710 start = offset + 1; 711 } else if (offset == start) { 712 /* we found a zero */ 713 found++; 714 /* move start to the next bit to test */ 715 start++; 716 } else { 717 /* got a zero after some ones */ 718 found = 1; 719 start = offset + 1; 720 } 721 if (found > best_size) { 722 best_size = found; 723 best_offset = start - found; 724 } 725 /* we got everything we needed */ 726 if (found == bits_wanted) { 727 /* mlog(0, "Found it all!\n"); */ 728 break; 729 } 730 } 731 732 /* XXX: I think the first clause is equivalent to the second 733 * - jlbec */ 734 if (found == bits_wanted) { 735 *bit_off = start - found; 736 *bits_found = found; 737 } else if (best_size) { 738 *bit_off = best_offset; 739 *bits_found = best_size; 740 } else { 741 status = -ENOSPC; 742 /* No error log here -- see the comment above 743 * ocfs2_test_bg_bit_allocatable */ 744 } 745 746 return status; 747 } 748 749 static inline int ocfs2_block_group_set_bits(handle_t *handle, 750 struct inode *alloc_inode, 751 struct ocfs2_group_desc *bg, 752 struct buffer_head *group_bh, 753 unsigned int bit_off, 754 unsigned int num_bits) 755 { 756 int status; 757 void *bitmap = bg->bg_bitmap; 758 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 759 760 mlog_entry_void(); 761 762 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 763 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 764 status = -EIO; 765 goto bail; 766 } 767 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 768 769 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 770 num_bits); 771 772 if (ocfs2_is_cluster_bitmap(alloc_inode)) 773 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 774 775 status = ocfs2_journal_access(handle, 776 alloc_inode, 777 group_bh, 778 journal_type); 779 if (status < 0) { 780 mlog_errno(status); 781 goto bail; 782 } 783 784 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 785 786 while(num_bits--) 787 ocfs2_set_bit(bit_off++, bitmap); 788 789 status = ocfs2_journal_dirty(handle, 790 group_bh); 791 if (status < 0) { 792 mlog_errno(status); 793 goto bail; 794 } 795 796 bail: 797 mlog_exit(status); 798 return status; 799 } 800 801 /* find the one with the most empty bits */ 802 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) 803 { 804 u16 curr, best; 805 806 BUG_ON(!cl->cl_next_free_rec); 807 808 best = curr = 0; 809 while (curr < le16_to_cpu(cl->cl_next_free_rec)) { 810 if (le32_to_cpu(cl->cl_recs[curr].c_free) > 811 le32_to_cpu(cl->cl_recs[best].c_free)) 812 best = curr; 813 curr++; 814 } 815 816 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); 817 return best; 818 } 819 820 static int ocfs2_relink_block_group(handle_t *handle, 821 struct inode *alloc_inode, 822 struct buffer_head *fe_bh, 823 struct buffer_head *bg_bh, 824 struct buffer_head *prev_bg_bh, 825 u16 chain) 826 { 827 int status; 828 /* there is a really tiny chance the journal calls could fail, 829 * but we wouldn't want inconsistent blocks in *any* case. */ 830 u64 fe_ptr, bg_ptr, prev_bg_ptr; 831 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 832 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 833 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 834 835 if (!OCFS2_IS_VALID_DINODE(fe)) { 836 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 837 status = -EIO; 838 goto out; 839 } 840 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 841 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 842 status = -EIO; 843 goto out; 844 } 845 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { 846 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); 847 status = -EIO; 848 goto out; 849 } 850 851 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 852 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 853 (unsigned long long)le64_to_cpu(bg->bg_blkno), 854 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 855 856 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 857 bg_ptr = le64_to_cpu(bg->bg_next_group); 858 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 859 860 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, 861 OCFS2_JOURNAL_ACCESS_WRITE); 862 if (status < 0) { 863 mlog_errno(status); 864 goto out_rollback; 865 } 866 867 prev_bg->bg_next_group = bg->bg_next_group; 868 869 status = ocfs2_journal_dirty(handle, prev_bg_bh); 870 if (status < 0) { 871 mlog_errno(status); 872 goto out_rollback; 873 } 874 875 status = ocfs2_journal_access(handle, alloc_inode, bg_bh, 876 OCFS2_JOURNAL_ACCESS_WRITE); 877 if (status < 0) { 878 mlog_errno(status); 879 goto out_rollback; 880 } 881 882 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 883 884 status = ocfs2_journal_dirty(handle, bg_bh); 885 if (status < 0) { 886 mlog_errno(status); 887 goto out_rollback; 888 } 889 890 status = ocfs2_journal_access(handle, alloc_inode, fe_bh, 891 OCFS2_JOURNAL_ACCESS_WRITE); 892 if (status < 0) { 893 mlog_errno(status); 894 goto out_rollback; 895 } 896 897 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 898 899 status = ocfs2_journal_dirty(handle, fe_bh); 900 if (status < 0) { 901 mlog_errno(status); 902 goto out_rollback; 903 } 904 905 status = 0; 906 out_rollback: 907 if (status < 0) { 908 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 909 bg->bg_next_group = cpu_to_le64(bg_ptr); 910 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 911 } 912 out: 913 mlog_exit(status); 914 return status; 915 } 916 917 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 918 u32 wanted) 919 { 920 return le16_to_cpu(bg->bg_free_bits_count) > wanted; 921 } 922 923 /* return 0 on success, -ENOSPC to keep searching and any other < 0 924 * value on error. */ 925 static int ocfs2_cluster_group_search(struct inode *inode, 926 struct buffer_head *group_bh, 927 u32 bits_wanted, u32 min_bits, 928 u16 *bit_off, u16 *bits_found) 929 { 930 int search = -ENOSPC; 931 int ret; 932 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 933 u16 tmp_off, tmp_found; 934 unsigned int max_bits, gd_cluster_off; 935 936 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 937 938 if (gd->bg_free_bits_count) { 939 max_bits = le16_to_cpu(gd->bg_bits); 940 941 /* Tail groups in cluster bitmaps which aren't cpg 942 * aligned are prone to partial extention by a failed 943 * fs resize. If the file system resize never got to 944 * update the dinode cluster count, then we don't want 945 * to trust any clusters past it, regardless of what 946 * the group descriptor says. */ 947 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, 948 le64_to_cpu(gd->bg_blkno)); 949 if ((gd_cluster_off + max_bits) > 950 OCFS2_I(inode)->ip_clusters) { 951 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; 952 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", 953 (unsigned long long)le64_to_cpu(gd->bg_blkno), 954 le16_to_cpu(gd->bg_bits), 955 OCFS2_I(inode)->ip_clusters, max_bits); 956 } 957 958 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 959 group_bh, bits_wanted, 960 max_bits, 961 &tmp_off, &tmp_found); 962 if (ret) 963 return ret; 964 965 /* ocfs2_block_group_find_clear_bits() might 966 * return success, but we still want to return 967 * -ENOSPC unless it found the minimum number 968 * of bits. */ 969 if (min_bits <= tmp_found) { 970 *bit_off = tmp_off; 971 *bits_found = tmp_found; 972 search = 0; /* success */ 973 } 974 } 975 976 return search; 977 } 978 979 static int ocfs2_block_group_search(struct inode *inode, 980 struct buffer_head *group_bh, 981 u32 bits_wanted, u32 min_bits, 982 u16 *bit_off, u16 *bits_found) 983 { 984 int ret = -ENOSPC; 985 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 986 987 BUG_ON(min_bits != 1); 988 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 989 990 if (bg->bg_free_bits_count) 991 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 992 group_bh, bits_wanted, 993 le16_to_cpu(bg->bg_bits), 994 bit_off, bits_found); 995 996 return ret; 997 } 998 999 static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1000 handle_t *handle, 1001 struct buffer_head *di_bh, 1002 u32 num_bits, 1003 u16 chain) 1004 { 1005 int ret; 1006 u32 tmp_used; 1007 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1008 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1009 1010 ret = ocfs2_journal_access(handle, inode, di_bh, 1011 OCFS2_JOURNAL_ACCESS_WRITE); 1012 if (ret < 0) { 1013 mlog_errno(ret); 1014 goto out; 1015 } 1016 1017 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1018 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1019 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1020 1021 ret = ocfs2_journal_dirty(handle, di_bh); 1022 if (ret < 0) 1023 mlog_errno(ret); 1024 1025 out: 1026 return ret; 1027 } 1028 1029 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1030 handle_t *handle, 1031 u32 bits_wanted, 1032 u32 min_bits, 1033 u16 *bit_off, 1034 unsigned int *num_bits, 1035 u64 gd_blkno, 1036 u16 *bits_left) 1037 { 1038 int ret; 1039 u16 found; 1040 struct buffer_head *group_bh = NULL; 1041 struct ocfs2_group_desc *gd; 1042 struct inode *alloc_inode = ac->ac_inode; 1043 1044 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, 1045 &group_bh, OCFS2_BH_CACHED, alloc_inode); 1046 if (ret < 0) { 1047 mlog_errno(ret); 1048 return ret; 1049 } 1050 1051 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1052 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 1053 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); 1054 ret = -EIO; 1055 goto out; 1056 } 1057 1058 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1059 bit_off, &found); 1060 if (ret < 0) { 1061 if (ret != -ENOSPC) 1062 mlog_errno(ret); 1063 goto out; 1064 } 1065 1066 *num_bits = found; 1067 1068 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1069 *num_bits, 1070 le16_to_cpu(gd->bg_chain)); 1071 if (ret < 0) { 1072 mlog_errno(ret); 1073 goto out; 1074 } 1075 1076 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1077 *bit_off, *num_bits); 1078 if (ret < 0) 1079 mlog_errno(ret); 1080 1081 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1082 1083 out: 1084 brelse(group_bh); 1085 1086 return ret; 1087 } 1088 1089 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, 1090 handle_t *handle, 1091 u32 bits_wanted, 1092 u32 min_bits, 1093 u16 *bit_off, 1094 unsigned int *num_bits, 1095 u64 *bg_blkno, 1096 u16 *bits_left) 1097 { 1098 int status; 1099 u16 chain, tmp_bits; 1100 u32 tmp_used; 1101 u64 next_group; 1102 struct inode *alloc_inode = ac->ac_inode; 1103 struct buffer_head *group_bh = NULL; 1104 struct buffer_head *prev_group_bh = NULL; 1105 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1106 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1107 struct ocfs2_group_desc *bg; 1108 1109 chain = ac->ac_chain; 1110 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", 1111 bits_wanted, chain, 1112 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1113 1114 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), 1115 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1116 &group_bh, OCFS2_BH_CACHED, alloc_inode); 1117 if (status < 0) { 1118 mlog_errno(status); 1119 goto bail; 1120 } 1121 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1122 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); 1123 if (status) { 1124 mlog_errno(status); 1125 goto bail; 1126 } 1127 1128 status = -ENOSPC; 1129 /* for now, the chain search is a bit simplistic. We just use 1130 * the 1st group with any empty bits. */ 1131 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1132 bits_wanted, min_bits, bit_off, 1133 &tmp_bits)) == -ENOSPC) { 1134 if (!bg->bg_next_group) 1135 break; 1136 1137 if (prev_group_bh) { 1138 brelse(prev_group_bh); 1139 prev_group_bh = NULL; 1140 } 1141 next_group = le64_to_cpu(bg->bg_next_group); 1142 prev_group_bh = group_bh; 1143 group_bh = NULL; 1144 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), 1145 next_group, &group_bh, 1146 OCFS2_BH_CACHED, alloc_inode); 1147 if (status < 0) { 1148 mlog_errno(status); 1149 goto bail; 1150 } 1151 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1152 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); 1153 if (status) { 1154 mlog_errno(status); 1155 goto bail; 1156 } 1157 } 1158 if (status < 0) { 1159 if (status != -ENOSPC) 1160 mlog_errno(status); 1161 goto bail; 1162 } 1163 1164 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1165 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1166 1167 *num_bits = tmp_bits; 1168 1169 BUG_ON(*num_bits == 0); 1170 1171 /* 1172 * Keep track of previous block descriptor read. When 1173 * we find a target, if we have read more than X 1174 * number of descriptors, and the target is reasonably 1175 * empty, relink him to top of his chain. 1176 * 1177 * We've read 0 extra blocks and only send one more to 1178 * the transaction, yet the next guy to search has a 1179 * much easier time. 1180 * 1181 * Do this *after* figuring out how many bits we're taking out 1182 * of our target group. 1183 */ 1184 if (ac->ac_allow_chain_relink && 1185 (prev_group_bh) && 1186 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1187 status = ocfs2_relink_block_group(handle, alloc_inode, 1188 ac->ac_bh, group_bh, 1189 prev_group_bh, chain); 1190 if (status < 0) { 1191 mlog_errno(status); 1192 goto bail; 1193 } 1194 } 1195 1196 /* Ok, claim our bits now: set the info on dinode, chainlist 1197 * and then the group */ 1198 status = ocfs2_journal_access(handle, 1199 alloc_inode, 1200 ac->ac_bh, 1201 OCFS2_JOURNAL_ACCESS_WRITE); 1202 if (status < 0) { 1203 mlog_errno(status); 1204 goto bail; 1205 } 1206 1207 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1208 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1209 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1210 1211 status = ocfs2_journal_dirty(handle, 1212 ac->ac_bh); 1213 if (status < 0) { 1214 mlog_errno(status); 1215 goto bail; 1216 } 1217 1218 status = ocfs2_block_group_set_bits(handle, 1219 alloc_inode, 1220 bg, 1221 group_bh, 1222 *bit_off, 1223 *num_bits); 1224 if (status < 0) { 1225 mlog_errno(status); 1226 goto bail; 1227 } 1228 1229 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1230 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1231 1232 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1233 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1234 bail: 1235 if (group_bh) 1236 brelse(group_bh); 1237 if (prev_group_bh) 1238 brelse(prev_group_bh); 1239 1240 mlog_exit(status); 1241 return status; 1242 } 1243 1244 /* will give out up to bits_wanted contiguous bits. */ 1245 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1246 struct ocfs2_alloc_context *ac, 1247 handle_t *handle, 1248 u32 bits_wanted, 1249 u32 min_bits, 1250 u16 *bit_off, 1251 unsigned int *num_bits, 1252 u64 *bg_blkno) 1253 { 1254 int status; 1255 u16 victim, i; 1256 u16 bits_left = 0; 1257 u64 hint_blkno = ac->ac_last_group; 1258 struct ocfs2_chain_list *cl; 1259 struct ocfs2_dinode *fe; 1260 1261 mlog_entry_void(); 1262 1263 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 1264 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); 1265 BUG_ON(!ac->ac_bh); 1266 1267 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1268 if (!OCFS2_IS_VALID_DINODE(fe)) { 1269 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); 1270 status = -EIO; 1271 goto bail; 1272 } 1273 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1274 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1275 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1276 "bits but only %u total.", 1277 (unsigned long long)le64_to_cpu(fe->i_blkno), 1278 le32_to_cpu(fe->id1.bitmap1.i_used), 1279 le32_to_cpu(fe->id1.bitmap1.i_total)); 1280 status = -EIO; 1281 goto bail; 1282 } 1283 1284 if (hint_blkno) { 1285 /* Attempt to short-circuit the usual search mechanism 1286 * by jumping straight to the most recently used 1287 * allocation group. This helps us mantain some 1288 * contiguousness across allocations. */ 1289 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1290 min_bits, bit_off, num_bits, 1291 hint_blkno, &bits_left); 1292 if (!status) { 1293 /* Be careful to update *bg_blkno here as the 1294 * caller is expecting it to be filled in, and 1295 * ocfs2_search_one_group() won't do that for 1296 * us. */ 1297 *bg_blkno = hint_blkno; 1298 goto set_hint; 1299 } 1300 if (status < 0 && status != -ENOSPC) { 1301 mlog_errno(status); 1302 goto bail; 1303 } 1304 } 1305 1306 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1307 1308 victim = ocfs2_find_victim_chain(cl); 1309 ac->ac_chain = victim; 1310 ac->ac_allow_chain_relink = 1; 1311 1312 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1313 num_bits, bg_blkno, &bits_left); 1314 if (!status) 1315 goto set_hint; 1316 if (status < 0 && status != -ENOSPC) { 1317 mlog_errno(status); 1318 goto bail; 1319 } 1320 1321 mlog(0, "Search of victim chain %u came up with nothing, " 1322 "trying all chains now.\n", victim); 1323 1324 /* If we didn't pick a good victim, then just default to 1325 * searching each chain in order. Don't allow chain relinking 1326 * because we only calculate enough journal credits for one 1327 * relink per alloc. */ 1328 ac->ac_allow_chain_relink = 0; 1329 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { 1330 if (i == victim) 1331 continue; 1332 if (!cl->cl_recs[i].c_free) 1333 continue; 1334 1335 ac->ac_chain = i; 1336 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1337 bit_off, num_bits, bg_blkno, 1338 &bits_left); 1339 if (!status) 1340 break; 1341 if (status < 0 && status != -ENOSPC) { 1342 mlog_errno(status); 1343 goto bail; 1344 } 1345 } 1346 1347 set_hint: 1348 if (status != -ENOSPC) { 1349 /* If the next search of this group is not likely to 1350 * yield a suitable extent, then we reset the last 1351 * group hint so as to not waste a disk read */ 1352 if (bits_left < min_bits) 1353 ac->ac_last_group = 0; 1354 else 1355 ac->ac_last_group = *bg_blkno; 1356 } 1357 1358 bail: 1359 mlog_exit(status); 1360 return status; 1361 } 1362 1363 int ocfs2_claim_metadata(struct ocfs2_super *osb, 1364 handle_t *handle, 1365 struct ocfs2_alloc_context *ac, 1366 u32 bits_wanted, 1367 u16 *suballoc_bit_start, 1368 unsigned int *num_bits, 1369 u64 *blkno_start) 1370 { 1371 int status; 1372 u64 bg_blkno; 1373 1374 BUG_ON(!ac); 1375 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1376 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1377 1378 status = ocfs2_claim_suballoc_bits(osb, 1379 ac, 1380 handle, 1381 bits_wanted, 1382 1, 1383 suballoc_bit_start, 1384 num_bits, 1385 &bg_blkno); 1386 if (status < 0) { 1387 mlog_errno(status); 1388 goto bail; 1389 } 1390 atomic_inc(&osb->alloc_stats.bg_allocs); 1391 1392 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 1393 ac->ac_bits_given += (*num_bits); 1394 status = 0; 1395 bail: 1396 mlog_exit(status); 1397 return status; 1398 } 1399 1400 int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1401 handle_t *handle, 1402 struct ocfs2_alloc_context *ac, 1403 u16 *suballoc_bit, 1404 u64 *fe_blkno) 1405 { 1406 int status; 1407 unsigned int num_bits; 1408 u64 bg_blkno; 1409 1410 mlog_entry_void(); 1411 1412 BUG_ON(!ac); 1413 BUG_ON(ac->ac_bits_given != 0); 1414 BUG_ON(ac->ac_bits_wanted != 1); 1415 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1416 1417 status = ocfs2_claim_suballoc_bits(osb, 1418 ac, 1419 handle, 1420 1, 1421 1, 1422 suballoc_bit, 1423 &num_bits, 1424 &bg_blkno); 1425 if (status < 0) { 1426 mlog_errno(status); 1427 goto bail; 1428 } 1429 atomic_inc(&osb->alloc_stats.bg_allocs); 1430 1431 BUG_ON(num_bits != 1); 1432 1433 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1434 ac->ac_bits_given++; 1435 status = 0; 1436 bail: 1437 mlog_exit(status); 1438 return status; 1439 } 1440 1441 /* translate a group desc. blkno and it's bitmap offset into 1442 * disk cluster offset. */ 1443 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 1444 u64 bg_blkno, 1445 u16 bg_bit_off) 1446 { 1447 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1448 u32 cluster = 0; 1449 1450 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1451 1452 if (bg_blkno != osb->first_cluster_group_blkno) 1453 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 1454 cluster += (u32) bg_bit_off; 1455 return cluster; 1456 } 1457 1458 /* given a cluster offset, calculate which block group it belongs to 1459 * and return that block offset. */ 1460 static inline u64 ocfs2_which_cluster_group(struct inode *inode, 1461 u32 cluster) 1462 { 1463 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1464 u32 group_no; 1465 1466 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1467 1468 group_no = cluster / osb->bitmap_cpg; 1469 if (!group_no) 1470 return osb->first_cluster_group_blkno; 1471 return ocfs2_clusters_to_blocks(inode->i_sb, 1472 group_no * osb->bitmap_cpg); 1473 } 1474 1475 /* given the block number of a cluster start, calculate which cluster 1476 * group and descriptor bitmap offset that corresponds to. */ 1477 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 1478 u64 data_blkno, 1479 u64 *bg_blkno, 1480 u16 *bg_bit_off) 1481 { 1482 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1483 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); 1484 1485 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1486 1487 *bg_blkno = ocfs2_which_cluster_group(inode, 1488 data_cluster); 1489 1490 if (*bg_blkno == osb->first_cluster_group_blkno) 1491 *bg_bit_off = (u16) data_cluster; 1492 else 1493 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, 1494 data_blkno - *bg_blkno); 1495 } 1496 1497 /* 1498 * min_bits - minimum contiguous chunk from this total allocation we 1499 * can handle. set to what we asked for originally for a full 1500 * contig. allocation, set to '1' to indicate we can deal with extents 1501 * of any size. 1502 */ 1503 int ocfs2_claim_clusters(struct ocfs2_super *osb, 1504 handle_t *handle, 1505 struct ocfs2_alloc_context *ac, 1506 u32 min_clusters, 1507 u32 *cluster_start, 1508 u32 *num_clusters) 1509 { 1510 int status; 1511 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 1512 u64 bg_blkno = 0; 1513 u16 bg_bit_off; 1514 1515 mlog_entry_void(); 1516 1517 BUG_ON(!ac); 1518 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 1519 1520 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 1521 && ac->ac_which != OCFS2_AC_USE_MAIN); 1522 1523 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 1524 status = ocfs2_claim_local_alloc_bits(osb, 1525 handle, 1526 ac, 1527 bits_wanted, 1528 cluster_start, 1529 num_clusters); 1530 if (!status) 1531 atomic_inc(&osb->alloc_stats.local_data); 1532 } else { 1533 if (min_clusters > (osb->bitmap_cpg - 1)) { 1534 /* The only paths asking for contiguousness 1535 * should know about this already. */ 1536 mlog(ML_ERROR, "minimum allocation requested exceeds " 1537 "group bitmap size!"); 1538 status = -ENOSPC; 1539 goto bail; 1540 } 1541 /* clamp the current request down to a realistic size. */ 1542 if (bits_wanted > (osb->bitmap_cpg - 1)) 1543 bits_wanted = osb->bitmap_cpg - 1; 1544 1545 status = ocfs2_claim_suballoc_bits(osb, 1546 ac, 1547 handle, 1548 bits_wanted, 1549 min_clusters, 1550 &bg_bit_off, 1551 num_clusters, 1552 &bg_blkno); 1553 if (!status) { 1554 *cluster_start = 1555 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 1556 bg_blkno, 1557 bg_bit_off); 1558 atomic_inc(&osb->alloc_stats.bitmap_data); 1559 } 1560 } 1561 if (status < 0) { 1562 if (status != -ENOSPC) 1563 mlog_errno(status); 1564 goto bail; 1565 } 1566 1567 ac->ac_bits_given += *num_clusters; 1568 1569 bail: 1570 mlog_exit(status); 1571 return status; 1572 } 1573 1574 static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1575 struct inode *alloc_inode, 1576 struct ocfs2_group_desc *bg, 1577 struct buffer_head *group_bh, 1578 unsigned int bit_off, 1579 unsigned int num_bits) 1580 { 1581 int status; 1582 unsigned int tmp; 1583 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1584 struct ocfs2_group_desc *undo_bg = NULL; 1585 1586 mlog_entry_void(); 1587 1588 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 1589 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 1590 status = -EIO; 1591 goto bail; 1592 } 1593 1594 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1595 1596 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1597 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1598 1599 status = ocfs2_journal_access(handle, alloc_inode, group_bh, 1600 journal_type); 1601 if (status < 0) { 1602 mlog_errno(status); 1603 goto bail; 1604 } 1605 1606 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1607 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; 1608 1609 tmp = num_bits; 1610 while(tmp--) { 1611 ocfs2_clear_bit((bit_off + tmp), 1612 (unsigned long *) bg->bg_bitmap); 1613 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1614 ocfs2_set_bit(bit_off + tmp, 1615 (unsigned long *) undo_bg->bg_bitmap); 1616 } 1617 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 1618 1619 status = ocfs2_journal_dirty(handle, group_bh); 1620 if (status < 0) 1621 mlog_errno(status); 1622 bail: 1623 return status; 1624 } 1625 1626 /* 1627 * expects the suballoc inode to already be locked. 1628 */ 1629 static int ocfs2_free_suballoc_bits(handle_t *handle, 1630 struct inode *alloc_inode, 1631 struct buffer_head *alloc_bh, 1632 unsigned int start_bit, 1633 u64 bg_blkno, 1634 unsigned int count) 1635 { 1636 int status = 0; 1637 u32 tmp_used; 1638 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 1639 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 1640 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 1641 struct buffer_head *group_bh = NULL; 1642 struct ocfs2_group_desc *group; 1643 1644 mlog_entry_void(); 1645 1646 if (!OCFS2_IS_VALID_DINODE(fe)) { 1647 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 1648 status = -EIO; 1649 goto bail; 1650 } 1651 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 1652 1653 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", 1654 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 1655 (unsigned long long)bg_blkno, start_bit); 1656 1657 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, 1658 alloc_inode); 1659 if (status < 0) { 1660 mlog_errno(status); 1661 goto bail; 1662 } 1663 1664 group = (struct ocfs2_group_desc *) group_bh->b_data; 1665 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); 1666 if (status) { 1667 mlog_errno(status); 1668 goto bail; 1669 } 1670 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 1671 1672 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 1673 group, group_bh, 1674 start_bit, count); 1675 if (status < 0) { 1676 mlog_errno(status); 1677 goto bail; 1678 } 1679 1680 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, 1681 OCFS2_JOURNAL_ACCESS_WRITE); 1682 if (status < 0) { 1683 mlog_errno(status); 1684 goto bail; 1685 } 1686 1687 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, 1688 count); 1689 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1690 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 1691 1692 status = ocfs2_journal_dirty(handle, alloc_bh); 1693 if (status < 0) { 1694 mlog_errno(status); 1695 goto bail; 1696 } 1697 1698 bail: 1699 if (group_bh) 1700 brelse(group_bh); 1701 1702 mlog_exit(status); 1703 return status; 1704 } 1705 1706 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 1707 { 1708 u64 group = block - (u64) bit; 1709 1710 return group; 1711 } 1712 1713 int ocfs2_free_dinode(handle_t *handle, 1714 struct inode *inode_alloc_inode, 1715 struct buffer_head *inode_alloc_bh, 1716 struct ocfs2_dinode *di) 1717 { 1718 u64 blk = le64_to_cpu(di->i_blkno); 1719 u16 bit = le16_to_cpu(di->i_suballoc_bit); 1720 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 1721 1722 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 1723 inode_alloc_bh, bit, bg_blkno, 1); 1724 } 1725 1726 int ocfs2_free_extent_block(handle_t *handle, 1727 struct inode *eb_alloc_inode, 1728 struct buffer_head *eb_alloc_bh, 1729 struct ocfs2_extent_block *eb) 1730 { 1731 u64 blk = le64_to_cpu(eb->h_blkno); 1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit); 1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 1734 1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, 1736 bit, bg_blkno, 1); 1737 } 1738 1739 int ocfs2_free_clusters(handle_t *handle, 1740 struct inode *bitmap_inode, 1741 struct buffer_head *bitmap_bh, 1742 u64 start_blk, 1743 unsigned int num_clusters) 1744 { 1745 int status; 1746 u16 bg_start_bit; 1747 u64 bg_blkno; 1748 struct ocfs2_dinode *fe; 1749 1750 /* You can't ever have a contiguous set of clusters 1751 * bigger than a block group bitmap so we never have to worry 1752 * about looping on them. */ 1753 1754 mlog_entry_void(); 1755 1756 /* This is expensive. We can safely remove once this stuff has 1757 * gotten tested really well. */ 1758 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); 1759 1760 fe = (struct ocfs2_dinode *) bitmap_bh->b_data; 1761 1762 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 1763 &bg_start_bit); 1764 1765 mlog(0, "want to free %u clusters starting at block %llu\n", 1766 num_clusters, (unsigned long long)start_blk); 1767 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 1768 (unsigned long long)bg_blkno, bg_start_bit); 1769 1770 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 1771 bg_start_bit, bg_blkno, 1772 num_clusters); 1773 if (status < 0) 1774 mlog_errno(status); 1775 1776 mlog_exit(status); 1777 return status; 1778 } 1779 1780 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 1781 { 1782 printk("Block Group:\n"); 1783 printk("bg_signature: %s\n", bg->bg_signature); 1784 printk("bg_size: %u\n", bg->bg_size); 1785 printk("bg_bits: %u\n", bg->bg_bits); 1786 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); 1787 printk("bg_chain: %u\n", bg->bg_chain); 1788 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); 1789 printk("bg_next_group: %llu\n", 1790 (unsigned long long)bg->bg_next_group); 1791 printk("bg_parent_dinode: %llu\n", 1792 (unsigned long long)bg->bg_parent_dinode); 1793 printk("bg_blkno: %llu\n", 1794 (unsigned long long)bg->bg_blkno); 1795 } 1796 1797 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) 1798 { 1799 int i; 1800 1801 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); 1802 printk("i_signature: %s\n", fe->i_signature); 1803 printk("i_size: %llu\n", 1804 (unsigned long long)fe->i_size); 1805 printk("i_clusters: %u\n", fe->i_clusters); 1806 printk("i_generation: %u\n", 1807 le32_to_cpu(fe->i_generation)); 1808 printk("id1.bitmap1.i_used: %u\n", 1809 le32_to_cpu(fe->id1.bitmap1.i_used)); 1810 printk("id1.bitmap1.i_total: %u\n", 1811 le32_to_cpu(fe->id1.bitmap1.i_total)); 1812 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); 1813 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); 1814 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); 1815 printk("id2.i_chain.cl_next_free_rec: %u\n", 1816 fe->id2.i_chain.cl_next_free_rec); 1817 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { 1818 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, 1819 fe->id2.i_chain.cl_recs[i].c_free); 1820 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, 1821 fe->id2.i_chain.cl_recs[i].c_total); 1822 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, 1823 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); 1824 } 1825 } 1826