1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * suballoc.c 5 * 6 * metadata alloc and free 7 * Inspired by ext3 block groups. 8 * 9 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 10 * 11 * This program is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU General Public 13 * License as published by the Free Software Foundation; either 14 * version 2 of the License, or (at your option) any later version. 15 * 16 * This program is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public 22 * License along with this program; if not, write to the 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 24 * Boston, MA 021110-1307, USA. 25 */ 26 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33 #include <cluster/masklog.h> 34 35 #include "ocfs2.h" 36 37 #include "alloc.h" 38 #include "blockcheck.h" 39 #include "dlmglue.h" 40 #include "inode.h" 41 #include "journal.h" 42 #include "localalloc.h" 43 #include "suballoc.h" 44 #include "super.h" 45 #include "sysfile.h" 46 #include "uptodate.h" 47 48 #include "buffer_head_io.h" 49 50 #define NOT_ALLOC_NEW_GROUP 0 51 #define ALLOC_NEW_GROUP 0x1 52 #define ALLOC_GROUPS_FROM_GLOBAL 0x2 53 54 #define OCFS2_MAX_TO_STEAL 1024 55 56 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 57 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 58 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 59 static int ocfs2_block_group_fill(handle_t *handle, 60 struct inode *alloc_inode, 61 struct buffer_head *bg_bh, 62 u64 group_blkno, 63 u16 my_chain, 64 
struct ocfs2_chain_list *cl); 65 static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 66 struct inode *alloc_inode, 67 struct buffer_head *bh, 68 u64 max_block, 69 u64 *last_alloc_group, 70 int flags); 71 72 static int ocfs2_cluster_group_search(struct inode *inode, 73 struct buffer_head *group_bh, 74 u32 bits_wanted, u32 min_bits, 75 u64 max_block, 76 u16 *bit_off, u16 *bits_found); 77 static int ocfs2_block_group_search(struct inode *inode, 78 struct buffer_head *group_bh, 79 u32 bits_wanted, u32 min_bits, 80 u64 max_block, 81 u16 *bit_off, u16 *bits_found); 82 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 83 struct ocfs2_alloc_context *ac, 84 handle_t *handle, 85 u32 bits_wanted, 86 u32 min_bits, 87 u16 *bit_off, 88 unsigned int *num_bits, 89 u64 *bg_blkno); 90 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 91 int nr); 92 static inline int ocfs2_block_group_set_bits(handle_t *handle, 93 struct inode *alloc_inode, 94 struct ocfs2_group_desc *bg, 95 struct buffer_head *group_bh, 96 unsigned int bit_off, 97 unsigned int num_bits); 98 static inline int ocfs2_block_group_clear_bits(handle_t *handle, 99 struct inode *alloc_inode, 100 struct ocfs2_group_desc *bg, 101 struct buffer_head *group_bh, 102 unsigned int bit_off, 103 unsigned int num_bits); 104 105 static int ocfs2_relink_block_group(handle_t *handle, 106 struct inode *alloc_inode, 107 struct buffer_head *fe_bh, 108 struct buffer_head *bg_bh, 109 struct buffer_head *prev_bg_bh, 110 u16 chain); 111 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 112 u32 wanted); 113 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 114 u64 bg_blkno, 115 u16 bg_bit_off); 116 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 117 u64 data_blkno, 118 u64 *bg_blkno, 119 u16 *bg_bit_off); 120 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 121 u32 bits_wanted, u64 max_block, 122 int flags, 123 
struct ocfs2_alloc_context **ac); 124 125 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 126 { 127 struct inode *inode = ac->ac_inode; 128 129 if (inode) { 130 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 131 ocfs2_inode_unlock(inode, 1); 132 133 mutex_unlock(&inode->i_mutex); 134 135 iput(inode); 136 ac->ac_inode = NULL; 137 } 138 brelse(ac->ac_bh); 139 ac->ac_bh = NULL; 140 } 141 142 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143 { 144 ocfs2_free_ac_resource(ac); 145 kfree(ac); 146 } 147 148 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) 149 { 150 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); 151 } 152 153 #define do_error(fmt, ...) \ 154 do{ \ 155 if (clean_error) \ 156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 157 else \ 158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 159 } while (0) 160 161 static int ocfs2_validate_gd_self(struct super_block *sb, 162 struct buffer_head *bh, 163 int clean_error) 164 { 165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 166 167 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 168 do_error("Group descriptor #%llu has bad signature %.*s", 169 (unsigned long long)bh->b_blocknr, 7, 170 gd->bg_signature); 171 return -EINVAL; 172 } 173 174 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { 175 do_error("Group descriptor #%llu has an invalid bg_blkno " 176 "of %llu", 177 (unsigned long long)bh->b_blocknr, 178 (unsigned long long)le64_to_cpu(gd->bg_blkno)); 179 return -EINVAL; 180 } 181 182 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { 183 do_error("Group descriptor #%llu has an invalid " 184 "fs_generation of #%u", 185 (unsigned long long)bh->b_blocknr, 186 le32_to_cpu(gd->bg_generation)); 187 return -EINVAL; 188 } 189 190 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 191 do_error("Group descriptor #%llu has bit count %u but " 192 "claims that %u are free", 193 (unsigned long long)bh->b_blocknr, 194 
le16_to_cpu(gd->bg_bits), 195 le16_to_cpu(gd->bg_free_bits_count)); 196 return -EINVAL; 197 } 198 199 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 200 do_error("Group descriptor #%llu has bit count %u but " 201 "max bitmap bits of %u", 202 (unsigned long long)bh->b_blocknr, 203 le16_to_cpu(gd->bg_bits), 204 8 * le16_to_cpu(gd->bg_size)); 205 return -EINVAL; 206 } 207 208 return 0; 209 } 210 211 static int ocfs2_validate_gd_parent(struct super_block *sb, 212 struct ocfs2_dinode *di, 213 struct buffer_head *bh, 214 int clean_error) 215 { 216 unsigned int max_bits; 217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 218 219 if (di->i_blkno != gd->bg_parent_dinode) { 220 do_error("Group descriptor #%llu has bad parent " 221 "pointer (%llu, expected %llu)", 222 (unsigned long long)bh->b_blocknr, 223 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 224 (unsigned long long)le64_to_cpu(di->i_blkno)); 225 return -EINVAL; 226 } 227 228 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 229 if (le16_to_cpu(gd->bg_bits) > max_bits) { 230 do_error("Group descriptor #%llu has bit count of %u", 231 (unsigned long long)bh->b_blocknr, 232 le16_to_cpu(gd->bg_bits)); 233 return -EINVAL; 234 } 235 236 if (le16_to_cpu(gd->bg_chain) >= 237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 238 do_error("Group descriptor #%llu has bad chain %u", 239 (unsigned long long)bh->b_blocknr, 240 le16_to_cpu(gd->bg_chain)); 241 return -EINVAL; 242 } 243 244 return 0; 245 } 246 247 #undef do_error 248 249 /* 250 * This version only prints errors. It does not fail the filesystem, and 251 * exists only for resize. 
252 */ 253 int ocfs2_check_group_descriptor(struct super_block *sb, 254 struct ocfs2_dinode *di, 255 struct buffer_head *bh) 256 { 257 int rc; 258 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 259 260 BUG_ON(!buffer_uptodate(bh)); 261 262 /* 263 * If the ecc fails, we return the error but otherwise 264 * leave the filesystem running. We know any error is 265 * local to this block. 266 */ 267 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); 268 if (rc) { 269 mlog(ML_ERROR, 270 "Checksum failed for group descriptor %llu\n", 271 (unsigned long long)bh->b_blocknr); 272 } else 273 rc = ocfs2_validate_gd_self(sb, bh, 1); 274 if (!rc) 275 rc = ocfs2_validate_gd_parent(sb, di, bh, 1); 276 277 return rc; 278 } 279 280 static int ocfs2_validate_group_descriptor(struct super_block *sb, 281 struct buffer_head *bh) 282 { 283 int rc; 284 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 285 286 mlog(0, "Validating group descriptor %llu\n", 287 (unsigned long long)bh->b_blocknr); 288 289 BUG_ON(!buffer_uptodate(bh)); 290 291 /* 292 * If the ecc fails, we return the error but otherwise 293 * leave the filesystem running. We know any error is 294 * local to this block. 295 */ 296 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); 297 if (rc) 298 return rc; 299 300 /* 301 * Errors after here are fatal. 302 */ 303 304 return ocfs2_validate_gd_self(sb, bh, 0); 305 } 306 307 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, 308 u64 gd_blkno, struct buffer_head **bh) 309 { 310 int rc; 311 struct buffer_head *tmp = *bh; 312 313 rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, 314 ocfs2_validate_group_descriptor); 315 if (rc) 316 goto out; 317 318 rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); 319 if (rc) { 320 brelse(tmp); 321 goto out; 322 } 323 324 /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ 325 if (!*bh) 326 *bh = tmp; 327 328 out: 329 return rc; 330 } 331 332 static int ocfs2_block_group_fill(handle_t *handle, 333 struct inode *alloc_inode, 334 struct buffer_head *bg_bh, 335 u64 group_blkno, 336 u16 my_chain, 337 struct ocfs2_chain_list *cl) 338 { 339 int status = 0; 340 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 341 struct super_block * sb = alloc_inode->i_sb; 342 343 mlog_entry_void(); 344 345 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 346 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " 347 "b_blocknr (%llu)", 348 (unsigned long long)group_blkno, 349 (unsigned long long) bg_bh->b_blocknr); 350 status = -EIO; 351 goto bail; 352 } 353 354 status = ocfs2_journal_access_gd(handle, 355 INODE_CACHE(alloc_inode), 356 bg_bh, 357 OCFS2_JOURNAL_ACCESS_CREATE); 358 if (status < 0) { 359 mlog_errno(status); 360 goto bail; 361 } 362 363 memset(bg, 0, sb->s_blocksize); 364 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 365 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 366 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 367 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 368 bg->bg_chain = cpu_to_le16(my_chain); 369 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 370 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 371 bg->bg_blkno = cpu_to_le64(group_blkno); 372 /* set the 1st bit in the bitmap to account for the descriptor block */ 373 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 374 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 375 376 status = ocfs2_journal_dirty(handle, bg_bh); 377 if (status < 0) 378 mlog_errno(status); 379 380 /* There is no need to zero out or otherwise initialize the 381 * other blocks in a group - All valid FS metadata in a block 382 * group stores the superblock fs_generation value at 383 * allocation time. 
*/ 384 385 bail: 386 mlog_exit(status); 387 return status; 388 } 389 390 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) 391 { 392 u16 curr, best; 393 394 best = curr = 0; 395 while (curr < le16_to_cpu(cl->cl_count)) { 396 if (le32_to_cpu(cl->cl_recs[best].c_total) > 397 le32_to_cpu(cl->cl_recs[curr].c_total)) 398 best = curr; 399 curr++; 400 } 401 return best; 402 } 403 404 /* 405 * We expect the block group allocator to already be locked. 406 */ 407 static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 408 struct inode *alloc_inode, 409 struct buffer_head *bh, 410 u64 max_block, 411 u64 *last_alloc_group, 412 int flags) 413 { 414 int status, credits; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 416 struct ocfs2_chain_list *cl; 417 struct ocfs2_alloc_context *ac = NULL; 418 handle_t *handle = NULL; 419 u32 bit_off, num_bits; 420 u16 alloc_rec; 421 u64 bg_blkno; 422 struct buffer_head *bg_bh = NULL; 423 struct ocfs2_group_desc *bg; 424 425 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); 426 427 mlog_entry_void(); 428 429 cl = &fe->id2.i_chain; 430 status = ocfs2_reserve_clusters_with_limit(osb, 431 le16_to_cpu(cl->cl_cpg), 432 max_block, flags, &ac); 433 if (status < 0) { 434 if (status != -ENOSPC) 435 mlog_errno(status); 436 goto bail; 437 } 438 439 credits = ocfs2_calc_group_alloc_credits(osb->sb, 440 le16_to_cpu(cl->cl_cpg)); 441 handle = ocfs2_start_trans(osb, credits); 442 if (IS_ERR(handle)) { 443 status = PTR_ERR(handle); 444 handle = NULL; 445 mlog_errno(status); 446 goto bail; 447 } 448 449 if (last_alloc_group && *last_alloc_group != 0) { 450 mlog(0, "use old allocation group %llu for block group alloc\n", 451 (unsigned long long)*last_alloc_group); 452 ac->ac_last_group = *last_alloc_group; 453 } 454 status = ocfs2_claim_clusters(osb, 455 handle, 456 ac, 457 le16_to_cpu(cl->cl_cpg), 458 &bit_off, 459 &num_bits); 460 if (status < 0) { 461 if (status != -ENOSPC) 462 mlog_errno(status); 463 goto bail; 464 } 
465 466 alloc_rec = ocfs2_find_smallest_chain(cl); 467 468 /* setup the group */ 469 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 470 mlog(0, "new descriptor, record %u, at block %llu\n", 471 alloc_rec, (unsigned long long)bg_blkno); 472 473 bg_bh = sb_getblk(osb->sb, bg_blkno); 474 if (!bg_bh) { 475 status = -EIO; 476 mlog_errno(status); 477 goto bail; 478 } 479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); 480 481 status = ocfs2_block_group_fill(handle, 482 alloc_inode, 483 bg_bh, 484 bg_blkno, 485 alloc_rec, 486 cl); 487 if (status < 0) { 488 mlog_errno(status); 489 goto bail; 490 } 491 492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 493 494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 495 bh, OCFS2_JOURNAL_ACCESS_WRITE); 496 if (status < 0) { 497 mlog_errno(status); 498 goto bail; 499 } 500 501 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 502 le16_to_cpu(bg->bg_free_bits_count)); 503 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 504 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 505 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 506 le16_add_cpu(&cl->cl_next_free_rec, 1); 507 508 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - 509 le16_to_cpu(bg->bg_free_bits_count)); 510 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 511 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 512 513 status = ocfs2_journal_dirty(handle, bh); 514 if (status < 0) { 515 mlog_errno(status); 516 goto bail; 517 } 518 519 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 520 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 521 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 522 le32_to_cpu(fe->i_clusters))); 523 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 524 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 526 527 status = 0; 528 529 /* save 
the new last alloc group so that the caller can cache it. */ 530 if (last_alloc_group) 531 *last_alloc_group = ac->ac_last_group; 532 533 bail: 534 if (handle) 535 ocfs2_commit_trans(osb, handle); 536 537 if (ac) 538 ocfs2_free_alloc_context(ac); 539 540 brelse(bg_bh); 541 542 mlog_exit(status); 543 return status; 544 } 545 546 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, 547 struct ocfs2_alloc_context *ac, 548 int type, 549 u32 slot, 550 u64 *last_alloc_group, 551 int flags) 552 { 553 int status; 554 u32 bits_wanted = ac->ac_bits_wanted; 555 struct inode *alloc_inode; 556 struct buffer_head *bh = NULL; 557 struct ocfs2_dinode *fe; 558 u32 free_bits; 559 560 mlog_entry_void(); 561 562 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); 563 if (!alloc_inode) { 564 mlog_errno(-EINVAL); 565 return -EINVAL; 566 } 567 568 mutex_lock(&alloc_inode->i_mutex); 569 570 status = ocfs2_inode_lock(alloc_inode, &bh, 1); 571 if (status < 0) { 572 mutex_unlock(&alloc_inode->i_mutex); 573 iput(alloc_inode); 574 575 mlog_errno(status); 576 return status; 577 } 578 579 ac->ac_inode = alloc_inode; 580 ac->ac_alloc_slot = slot; 581 582 fe = (struct ocfs2_dinode *) bh->b_data; 583 584 /* The bh was validated by the inode read inside 585 * ocfs2_inode_lock(). Any corruption is a code bug. 
*/ 586 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 587 588 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 589 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", 590 (unsigned long long)le64_to_cpu(fe->i_blkno)); 591 status = -EIO; 592 goto bail; 593 } 594 595 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - 596 le32_to_cpu(fe->id1.bitmap1.i_used); 597 598 if (bits_wanted > free_bits) { 599 /* cluster bitmap never grows */ 600 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 601 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", 602 bits_wanted, free_bits); 603 status = -ENOSPC; 604 goto bail; 605 } 606 607 if (!(flags & ALLOC_NEW_GROUP)) { 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 609 "and we don't alloc a new group for it.\n", 610 slot, bits_wanted, free_bits); 611 status = -ENOSPC; 612 goto bail; 613 } 614 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 616 ac->ac_max_block, 617 last_alloc_group, flags); 618 if (status < 0) { 619 if (status != -ENOSPC) 620 mlog_errno(status); 621 goto bail; 622 } 623 atomic_inc(&osb->alloc_stats.bg_extends); 624 625 /* You should never ask for this much metadata */ 626 BUG_ON(bits_wanted > 627 (le32_to_cpu(fe->id1.bitmap1.i_total) 628 - le32_to_cpu(fe->id1.bitmap1.i_used))); 629 } 630 631 get_bh(bh); 632 ac->ac_bh = bh; 633 bail: 634 brelse(bh); 635 636 mlog_exit(status); 637 return status; 638 } 639 640 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 641 { 642 spin_lock(&osb->osb_lock); 643 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; 644 spin_unlock(&osb->osb_lock); 645 atomic_set(&osb->s_num_inodes_stolen, 0); 646 } 647 648 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) 649 { 650 spin_lock(&osb->osb_lock); 651 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; 652 spin_unlock(&osb->osb_lock); 653 atomic_set(&osb->s_num_meta_stolen, 0); 654 } 655 656 void ocfs2_init_steal_slots(struct ocfs2_super *osb) 657 { 658 ocfs2_init_inode_steal_slot(osb); 659 
ocfs2_init_meta_steal_slot(osb); 660 } 661 662 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) 663 { 664 spin_lock(&osb->osb_lock); 665 if (type == INODE_ALLOC_SYSTEM_INODE) 666 osb->s_inode_steal_slot = slot; 667 else if (type == EXTENT_ALLOC_SYSTEM_INODE) 668 osb->s_meta_steal_slot = slot; 669 spin_unlock(&osb->osb_lock); 670 } 671 672 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) 673 { 674 int slot = OCFS2_INVALID_SLOT; 675 676 spin_lock(&osb->osb_lock); 677 if (type == INODE_ALLOC_SYSTEM_INODE) 678 slot = osb->s_inode_steal_slot; 679 else if (type == EXTENT_ALLOC_SYSTEM_INODE) 680 slot = osb->s_meta_steal_slot; 681 spin_unlock(&osb->osb_lock); 682 683 return slot; 684 } 685 686 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) 687 { 688 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); 689 } 690 691 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) 692 { 693 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); 694 } 695 696 static int ocfs2_steal_resource(struct ocfs2_super *osb, 697 struct ocfs2_alloc_context *ac, 698 int type) 699 { 700 int i, status = -ENOSPC; 701 int slot = __ocfs2_get_steal_slot(osb, type); 702 703 /* Start to steal resource from the first slot after ours. 
*/ 704 if (slot == OCFS2_INVALID_SLOT) 705 slot = osb->slot_num + 1; 706 707 for (i = 0; i < osb->max_slots; i++, slot++) { 708 if (slot == osb->max_slots) 709 slot = 0; 710 711 if (slot == osb->slot_num) 712 continue; 713 714 status = ocfs2_reserve_suballoc_bits(osb, ac, 715 type, 716 (u32)slot, NULL, 717 NOT_ALLOC_NEW_GROUP); 718 if (status >= 0) { 719 __ocfs2_set_steal_slot(osb, slot, type); 720 break; 721 } 722 723 ocfs2_free_ac_resource(ac); 724 } 725 726 return status; 727 } 728 729 static int ocfs2_steal_inode(struct ocfs2_super *osb, 730 struct ocfs2_alloc_context *ac) 731 { 732 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); 733 } 734 735 static int ocfs2_steal_meta(struct ocfs2_super *osb, 736 struct ocfs2_alloc_context *ac) 737 { 738 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); 739 } 740 741 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 742 int blocks, 743 struct ocfs2_alloc_context **ac) 744 { 745 int status; 746 int slot = ocfs2_get_meta_steal_slot(osb); 747 748 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 749 if (!(*ac)) { 750 status = -ENOMEM; 751 mlog_errno(status); 752 goto bail; 753 } 754 755 (*ac)->ac_bits_wanted = blocks; 756 (*ac)->ac_which = OCFS2_AC_USE_META; 757 (*ac)->ac_group_search = ocfs2_block_group_search; 758 759 if (slot != OCFS2_INVALID_SLOT && 760 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) 761 goto extent_steal; 762 763 atomic_set(&osb->s_num_meta_stolen, 0); 764 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 765 EXTENT_ALLOC_SYSTEM_INODE, 766 (u32)osb->slot_num, NULL, 767 ALLOC_NEW_GROUP); 768 769 770 if (status >= 0) { 771 status = 0; 772 if (slot != OCFS2_INVALID_SLOT) 773 ocfs2_init_meta_steal_slot(osb); 774 goto bail; 775 } else if (status < 0 && status != -ENOSPC) { 776 mlog_errno(status); 777 goto bail; 778 } 779 780 ocfs2_free_ac_resource(*ac); 781 782 extent_steal: 783 status = ocfs2_steal_meta(osb, *ac); 784 
atomic_inc(&osb->s_num_meta_stolen); 785 if (status < 0) { 786 if (status != -ENOSPC) 787 mlog_errno(status); 788 goto bail; 789 } 790 791 status = 0; 792 bail: 793 if ((status < 0) && *ac) { 794 ocfs2_free_alloc_context(*ac); 795 *ac = NULL; 796 } 797 798 mlog_exit(status); 799 return status; 800 } 801 802 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 803 struct ocfs2_extent_list *root_el, 804 struct ocfs2_alloc_context **ac) 805 { 806 return ocfs2_reserve_new_metadata_blocks(osb, 807 ocfs2_extend_meta_needed(root_el), 808 ac); 809 } 810 811 int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 812 struct ocfs2_alloc_context **ac) 813 { 814 int status; 815 int slot = ocfs2_get_inode_steal_slot(osb); 816 u64 alloc_group; 817 818 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 819 if (!(*ac)) { 820 status = -ENOMEM; 821 mlog_errno(status); 822 goto bail; 823 } 824 825 (*ac)->ac_bits_wanted = 1; 826 (*ac)->ac_which = OCFS2_AC_USE_INODE; 827 828 (*ac)->ac_group_search = ocfs2_block_group_search; 829 830 /* 831 * stat(2) can't handle i_ino > 32bits, so we tell the 832 * lower levels not to allocate us a block group past that 833 * limit. The 'inode64' mount option avoids this behavior. 834 */ 835 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) 836 (*ac)->ac_max_block = (u32)~0U; 837 838 /* 839 * slot is set when we successfully steal inode from other nodes. 840 * It is reset in 3 places: 841 * 1. when we flush the truncate log 842 * 2. when we complete local alloc recovery. 843 * 3. when we successfully allocate from our own slot. 844 * After it is set, we will go on stealing inodes until we find the 845 * need to check our slots to see whether there is some space for us. 
846 */ 847 if (slot != OCFS2_INVALID_SLOT && 848 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) 849 goto inode_steal; 850 851 atomic_set(&osb->s_num_inodes_stolen, 0); 852 alloc_group = osb->osb_inode_alloc_group; 853 status = ocfs2_reserve_suballoc_bits(osb, *ac, 854 INODE_ALLOC_SYSTEM_INODE, 855 (u32)osb->slot_num, 856 &alloc_group, 857 ALLOC_NEW_GROUP | 858 ALLOC_GROUPS_FROM_GLOBAL); 859 if (status >= 0) { 860 status = 0; 861 862 spin_lock(&osb->osb_lock); 863 osb->osb_inode_alloc_group = alloc_group; 864 spin_unlock(&osb->osb_lock); 865 mlog(0, "after reservation, new allocation group is " 866 "%llu\n", (unsigned long long)alloc_group); 867 868 /* 869 * Some inodes must be freed by us, so try to allocate 870 * from our own next time. 871 */ 872 if (slot != OCFS2_INVALID_SLOT) 873 ocfs2_init_inode_steal_slot(osb); 874 goto bail; 875 } else if (status < 0 && status != -ENOSPC) { 876 mlog_errno(status); 877 goto bail; 878 } 879 880 ocfs2_free_ac_resource(*ac); 881 882 inode_steal: 883 status = ocfs2_steal_inode(osb, *ac); 884 atomic_inc(&osb->s_num_inodes_stolen); 885 if (status < 0) { 886 if (status != -ENOSPC) 887 mlog_errno(status); 888 goto bail; 889 } 890 891 status = 0; 892 bail: 893 if ((status < 0) && *ac) { 894 ocfs2_free_alloc_context(*ac); 895 *ac = NULL; 896 } 897 898 mlog_exit(status); 899 return status; 900 } 901 902 /* local alloc code has to do the same thing, so rather than do this 903 * twice.. 
*/ 904 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 905 struct ocfs2_alloc_context *ac) 906 { 907 int status; 908 909 ac->ac_which = OCFS2_AC_USE_MAIN; 910 ac->ac_group_search = ocfs2_cluster_group_search; 911 912 status = ocfs2_reserve_suballoc_bits(osb, ac, 913 GLOBAL_BITMAP_SYSTEM_INODE, 914 OCFS2_INVALID_SLOT, NULL, 915 ALLOC_NEW_GROUP); 916 if (status < 0 && status != -ENOSPC) { 917 mlog_errno(status); 918 goto bail; 919 } 920 921 bail: 922 return status; 923 } 924 925 /* Callers don't need to care which bitmap (local alloc or main) to 926 * use so we figure it out for them, but unfortunately this clutters 927 * things a bit. */ 928 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 929 u32 bits_wanted, u64 max_block, 930 int flags, 931 struct ocfs2_alloc_context **ac) 932 { 933 int status; 934 935 mlog_entry_void(); 936 937 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 938 if (!(*ac)) { 939 status = -ENOMEM; 940 mlog_errno(status); 941 goto bail; 942 } 943 944 (*ac)->ac_bits_wanted = bits_wanted; 945 (*ac)->ac_max_block = max_block; 946 947 status = -ENOSPC; 948 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && 949 ocfs2_alloc_should_use_local(osb, bits_wanted)) { 950 status = ocfs2_reserve_local_alloc_bits(osb, 951 bits_wanted, 952 *ac); 953 if (status == -EFBIG) { 954 /* The local alloc window is outside ac_max_block. 955 * use the main bitmap. 
*/ 956 status = -ENOSPC; 957 } else if ((status < 0) && (status != -ENOSPC)) { 958 mlog_errno(status); 959 goto bail; 960 } 961 } 962 963 if (status == -ENOSPC) { 964 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 965 if (status < 0) { 966 if (status != -ENOSPC) 967 mlog_errno(status); 968 goto bail; 969 } 970 } 971 972 status = 0; 973 bail: 974 if ((status < 0) && *ac) { 975 ocfs2_free_alloc_context(*ac); 976 *ac = NULL; 977 } 978 979 mlog_exit(status); 980 return status; 981 } 982 983 int ocfs2_reserve_clusters(struct ocfs2_super *osb, 984 u32 bits_wanted, 985 struct ocfs2_alloc_context **ac) 986 { 987 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, 988 ALLOC_NEW_GROUP, ac); 989 } 990 991 /* 992 * More or less lifted from ext3. I'll leave their description below: 993 * 994 * "For ext3 allocations, we must not reuse any blocks which are 995 * allocated in the bitmap buffer's "last committed data" copy. This 996 * prevents deletes from freeing up the page for reuse until we have 997 * committed the delete transaction. 998 * 999 * If we didn't do this, then deleting something and reallocating it as 1000 * data would allow the old block to be overwritten before the 1001 * transaction committed (because we force data to disk before commit). 1002 * This would lead to corruption if we crashed between overwriting the 1003 * data and committing the delete. 1004 * 1005 * @@@ We may want to make this allocation behaviour conditional on 1006 * data-writes at some point, and disable it for metadata allocations or 1007 * sync-data inodes." 1008 * 1009 * Note: OCFS2 already does this differently for metadata vs data 1010 * allocations, as those bitmaps are separate and undo access is never 1011 * called on a metadata group descriptor. 
1012 */ 1013 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 1014 int nr) 1015 { 1016 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1017 int ret; 1018 1019 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) 1020 return 0; 1021 1022 if (!buffer_jbd(bg_bh)) 1023 return 1; 1024 1025 jbd_lock_bh_state(bg_bh); 1026 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; 1027 if (bg) 1028 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); 1029 else 1030 ret = 1; 1031 jbd_unlock_bh_state(bg_bh); 1032 1033 return ret; 1034 } 1035 1036 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 1037 struct buffer_head *bg_bh, 1038 unsigned int bits_wanted, 1039 unsigned int total_bits, 1040 u16 *bit_off, 1041 u16 *bits_found) 1042 { 1043 void *bitmap; 1044 u16 best_offset, best_size; 1045 int offset, start, found, status = 0; 1046 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1047 1048 /* Callers got this descriptor from 1049 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 1050 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1051 1052 found = start = best_offset = best_size = 0; 1053 bitmap = bg->bg_bitmap; 1054 1055 while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { 1056 if (offset == total_bits) 1057 break; 1058 1059 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { 1060 /* We found a zero, but we can't use it as it 1061 * hasn't been put to disk yet! 
*/ 1062 found = 0; 1063 start = offset + 1; 1064 } else if (offset == start) { 1065 /* we found a zero */ 1066 found++; 1067 /* move start to the next bit to test */ 1068 start++; 1069 } else { 1070 /* got a zero after some ones */ 1071 found = 1; 1072 start = offset + 1; 1073 } 1074 if (found > best_size) { 1075 best_size = found; 1076 best_offset = start - found; 1077 } 1078 /* we got everything we needed */ 1079 if (found == bits_wanted) { 1080 /* mlog(0, "Found it all!\n"); */ 1081 break; 1082 } 1083 } 1084 1085 /* XXX: I think the first clause is equivalent to the second 1086 * - jlbec */ 1087 if (found == bits_wanted) { 1088 *bit_off = start - found; 1089 *bits_found = found; 1090 } else if (best_size) { 1091 *bit_off = best_offset; 1092 *bits_found = best_size; 1093 } else { 1094 status = -ENOSPC; 1095 /* No error log here -- see the comment above 1096 * ocfs2_test_bg_bit_allocatable */ 1097 } 1098 1099 return status; 1100 } 1101 1102 static inline int ocfs2_block_group_set_bits(handle_t *handle, 1103 struct inode *alloc_inode, 1104 struct ocfs2_group_desc *bg, 1105 struct buffer_head *group_bh, 1106 unsigned int bit_off, 1107 unsigned int num_bits) 1108 { 1109 int status; 1110 void *bitmap = bg->bg_bitmap; 1111 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1112 1113 mlog_entry_void(); 1114 1115 /* All callers get the descriptor via 1116 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 1117 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1118 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 1119 1120 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 1121 num_bits); 1122 1123 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1124 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1125 1126 status = ocfs2_journal_access_gd(handle, 1127 INODE_CACHE(alloc_inode), 1128 group_bh, 1129 journal_type); 1130 if (status < 0) { 1131 mlog_errno(status); 1132 goto bail; 1133 } 1134 1135 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1136 1137 while(num_bits--) 1138 ocfs2_set_bit(bit_off++, bitmap); 1139 1140 status = ocfs2_journal_dirty(handle, 1141 group_bh); 1142 if (status < 0) { 1143 mlog_errno(status); 1144 goto bail; 1145 } 1146 1147 bail: 1148 mlog_exit(status); 1149 return status; 1150 } 1151 1152 /* find the one with the most empty bits */ 1153 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) 1154 { 1155 u16 curr, best; 1156 1157 BUG_ON(!cl->cl_next_free_rec); 1158 1159 best = curr = 0; 1160 while (curr < le16_to_cpu(cl->cl_next_free_rec)) { 1161 if (le32_to_cpu(cl->cl_recs[curr].c_free) > 1162 le32_to_cpu(cl->cl_recs[best].c_free)) 1163 best = curr; 1164 curr++; 1165 } 1166 1167 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); 1168 return best; 1169 } 1170 1171 static int ocfs2_relink_block_group(handle_t *handle, 1172 struct inode *alloc_inode, 1173 struct buffer_head *fe_bh, 1174 struct buffer_head *bg_bh, 1175 struct buffer_head *prev_bg_bh, 1176 u16 chain) 1177 { 1178 int status; 1179 /* there is a really tiny chance the journal calls could fail, 1180 * but we wouldn't want inconsistent blocks in *any* case. 
*/ 1181 u64 fe_ptr, bg_ptr, prev_bg_ptr; 1182 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1183 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1184 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1185 1186 /* The caller got these descriptors from 1187 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 1188 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1189 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); 1190 1191 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 1192 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1193 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1194 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1195 1196 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 1197 bg_ptr = le64_to_cpu(bg->bg_next_group); 1198 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1199 1200 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1201 prev_bg_bh, 1202 OCFS2_JOURNAL_ACCESS_WRITE); 1203 if (status < 0) { 1204 mlog_errno(status); 1205 goto out_rollback; 1206 } 1207 1208 prev_bg->bg_next_group = bg->bg_next_group; 1209 1210 status = ocfs2_journal_dirty(handle, prev_bg_bh); 1211 if (status < 0) { 1212 mlog_errno(status); 1213 goto out_rollback; 1214 } 1215 1216 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1217 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1218 if (status < 0) { 1219 mlog_errno(status); 1220 goto out_rollback; 1221 } 1222 1223 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1224 1225 status = ocfs2_journal_dirty(handle, bg_bh); 1226 if (status < 0) { 1227 mlog_errno(status); 1228 goto out_rollback; 1229 } 1230 1231 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1232 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1233 if (status < 0) { 1234 mlog_errno(status); 1235 goto out_rollback; 1236 } 1237 1238 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1239 1240 status 
= ocfs2_journal_dirty(handle, fe_bh); 1241 if (status < 0) { 1242 mlog_errno(status); 1243 goto out_rollback; 1244 } 1245 1246 status = 0; 1247 out_rollback: 1248 if (status < 0) { 1249 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1250 bg->bg_next_group = cpu_to_le64(bg_ptr); 1251 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1252 } 1253 1254 mlog_exit(status); 1255 return status; 1256 } 1257 1258 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 1259 u32 wanted) 1260 { 1261 return le16_to_cpu(bg->bg_free_bits_count) > wanted; 1262 } 1263 1264 /* return 0 on success, -ENOSPC to keep searching and any other < 0 1265 * value on error. */ 1266 static int ocfs2_cluster_group_search(struct inode *inode, 1267 struct buffer_head *group_bh, 1268 u32 bits_wanted, u32 min_bits, 1269 u64 max_block, 1270 u16 *bit_off, u16 *bits_found) 1271 { 1272 int search = -ENOSPC; 1273 int ret; 1274 u64 blkoff; 1275 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1277 u16 tmp_off, tmp_found; 1278 unsigned int max_bits, gd_cluster_off; 1279 1280 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1281 1282 if (gd->bg_free_bits_count) { 1283 max_bits = le16_to_cpu(gd->bg_bits); 1284 1285 /* Tail groups in cluster bitmaps which aren't cpg 1286 * aligned are prone to partial extention by a failed 1287 * fs resize. If the file system resize never got to 1288 * update the dinode cluster count, then we don't want 1289 * to trust any clusters past it, regardless of what 1290 * the group descriptor says. 
*/ 1291 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, 1292 le64_to_cpu(gd->bg_blkno)); 1293 if ((gd_cluster_off + max_bits) > 1294 OCFS2_I(inode)->ip_clusters) { 1295 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; 1296 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", 1297 (unsigned long long)le64_to_cpu(gd->bg_blkno), 1298 le16_to_cpu(gd->bg_bits), 1299 OCFS2_I(inode)->ip_clusters, max_bits); 1300 } 1301 1302 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1303 group_bh, bits_wanted, 1304 max_bits, 1305 &tmp_off, &tmp_found); 1306 if (ret) 1307 return ret; 1308 1309 if (max_block) { 1310 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1311 gd_cluster_off + 1312 tmp_off + tmp_found); 1313 mlog(0, "Checking %llu against %llu\n", 1314 (unsigned long long)blkoff, 1315 (unsigned long long)max_block); 1316 if (blkoff > max_block) 1317 return -ENOSPC; 1318 } 1319 1320 /* ocfs2_block_group_find_clear_bits() might 1321 * return success, but we still want to return 1322 * -ENOSPC unless it found the minimum number 1323 * of bits. */ 1324 if (min_bits <= tmp_found) { 1325 *bit_off = tmp_off; 1326 *bits_found = tmp_found; 1327 search = 0; /* success */ 1328 } else if (tmp_found) { 1329 /* 1330 * Don't show bits which we'll be returning 1331 * for allocation to the local alloc bitmap. 
1332 */ 1333 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1334 } 1335 } 1336 1337 return search; 1338 } 1339 1340 static int ocfs2_block_group_search(struct inode *inode, 1341 struct buffer_head *group_bh, 1342 u32 bits_wanted, u32 min_bits, 1343 u64 max_block, 1344 u16 *bit_off, u16 *bits_found) 1345 { 1346 int ret = -ENOSPC; 1347 u64 blkoff; 1348 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 1349 1350 BUG_ON(min_bits != 1); 1351 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 1352 1353 if (bg->bg_free_bits_count) { 1354 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1355 group_bh, bits_wanted, 1356 le16_to_cpu(bg->bg_bits), 1357 bit_off, bits_found); 1358 if (!ret && max_block) { 1359 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1360 *bits_found; 1361 mlog(0, "Checking %llu against %llu\n", 1362 (unsigned long long)blkoff, 1363 (unsigned long long)max_block); 1364 if (blkoff > max_block) 1365 ret = -ENOSPC; 1366 } 1367 } 1368 1369 return ret; 1370 } 1371 1372 static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1373 handle_t *handle, 1374 struct buffer_head *di_bh, 1375 u32 num_bits, 1376 u16 chain) 1377 { 1378 int ret; 1379 u32 tmp_used; 1380 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1381 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1382 1383 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 1384 OCFS2_JOURNAL_ACCESS_WRITE); 1385 if (ret < 0) { 1386 mlog_errno(ret); 1387 goto out; 1388 } 1389 1390 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1391 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1392 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1393 1394 ret = ocfs2_journal_dirty(handle, di_bh); 1395 if (ret < 0) 1396 mlog_errno(ret); 1397 1398 out: 1399 return ret; 1400 } 1401 1402 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1403 handle_t *handle, 1404 u32 bits_wanted, 1405 u32 min_bits, 1406 
u16 *bit_off, 1407 unsigned int *num_bits, 1408 u64 gd_blkno, 1409 u16 *bits_left) 1410 { 1411 int ret; 1412 u16 found; 1413 struct buffer_head *group_bh = NULL; 1414 struct ocfs2_group_desc *gd; 1415 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1416 struct inode *alloc_inode = ac->ac_inode; 1417 1418 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1419 &group_bh); 1420 if (ret < 0) { 1421 mlog_errno(ret); 1422 return ret; 1423 } 1424 1425 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1426 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1427 ac->ac_max_block, bit_off, &found); 1428 if (ret < 0) { 1429 if (ret != -ENOSPC) 1430 mlog_errno(ret); 1431 goto out; 1432 } 1433 1434 *num_bits = found; 1435 1436 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1437 *num_bits, 1438 le16_to_cpu(gd->bg_chain)); 1439 if (ret < 0) { 1440 mlog_errno(ret); 1441 goto out; 1442 } 1443 1444 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1445 *bit_off, *num_bits); 1446 if (ret < 0) 1447 mlog_errno(ret); 1448 1449 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1450 1451 out: 1452 brelse(group_bh); 1453 1454 return ret; 1455 } 1456 1457 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, 1458 handle_t *handle, 1459 u32 bits_wanted, 1460 u32 min_bits, 1461 u16 *bit_off, 1462 unsigned int *num_bits, 1463 u64 *bg_blkno, 1464 u16 *bits_left) 1465 { 1466 int status; 1467 u16 chain, tmp_bits; 1468 u32 tmp_used; 1469 u64 next_group; 1470 struct inode *alloc_inode = ac->ac_inode; 1471 struct buffer_head *group_bh = NULL; 1472 struct buffer_head *prev_group_bh = NULL; 1473 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1474 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1475 struct ocfs2_group_desc *bg; 1476 1477 chain = ac->ac_chain; 1478 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", 1479 bits_wanted, chain, 
1480 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1481 1482 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1483 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1484 &group_bh); 1485 if (status < 0) { 1486 mlog_errno(status); 1487 goto bail; 1488 } 1489 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1490 1491 status = -ENOSPC; 1492 /* for now, the chain search is a bit simplistic. We just use 1493 * the 1st group with any empty bits. */ 1494 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1495 bits_wanted, min_bits, 1496 ac->ac_max_block, bit_off, 1497 &tmp_bits)) == -ENOSPC) { 1498 if (!bg->bg_next_group) 1499 break; 1500 1501 brelse(prev_group_bh); 1502 prev_group_bh = NULL; 1503 1504 next_group = le64_to_cpu(bg->bg_next_group); 1505 prev_group_bh = group_bh; 1506 group_bh = NULL; 1507 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1508 next_group, &group_bh); 1509 if (status < 0) { 1510 mlog_errno(status); 1511 goto bail; 1512 } 1513 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1514 } 1515 if (status < 0) { 1516 if (status != -ENOSPC) 1517 mlog_errno(status); 1518 goto bail; 1519 } 1520 1521 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1522 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1523 1524 *num_bits = tmp_bits; 1525 1526 BUG_ON(*num_bits == 0); 1527 1528 /* 1529 * Keep track of previous block descriptor read. When 1530 * we find a target, if we have read more than X 1531 * number of descriptors, and the target is reasonably 1532 * empty, relink him to top of his chain. 1533 * 1534 * We've read 0 extra blocks and only send one more to 1535 * the transaction, yet the next guy to search has a 1536 * much easier time. 1537 * 1538 * Do this *after* figuring out how many bits we're taking out 1539 * of our target group. 
1540 */ 1541 if (ac->ac_allow_chain_relink && 1542 (prev_group_bh) && 1543 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1544 status = ocfs2_relink_block_group(handle, alloc_inode, 1545 ac->ac_bh, group_bh, 1546 prev_group_bh, chain); 1547 if (status < 0) { 1548 mlog_errno(status); 1549 goto bail; 1550 } 1551 } 1552 1553 /* Ok, claim our bits now: set the info on dinode, chainlist 1554 * and then the group */ 1555 status = ocfs2_journal_access_di(handle, 1556 INODE_CACHE(alloc_inode), 1557 ac->ac_bh, 1558 OCFS2_JOURNAL_ACCESS_WRITE); 1559 if (status < 0) { 1560 mlog_errno(status); 1561 goto bail; 1562 } 1563 1564 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1565 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1566 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1567 1568 status = ocfs2_journal_dirty(handle, 1569 ac->ac_bh); 1570 if (status < 0) { 1571 mlog_errno(status); 1572 goto bail; 1573 } 1574 1575 status = ocfs2_block_group_set_bits(handle, 1576 alloc_inode, 1577 bg, 1578 group_bh, 1579 *bit_off, 1580 *num_bits); 1581 if (status < 0) { 1582 mlog_errno(status); 1583 goto bail; 1584 } 1585 1586 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1587 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1588 1589 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1590 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1591 bail: 1592 brelse(group_bh); 1593 brelse(prev_group_bh); 1594 1595 mlog_exit(status); 1596 return status; 1597 } 1598 1599 /* will give out up to bits_wanted contiguous bits. 
*/ 1600 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1601 struct ocfs2_alloc_context *ac, 1602 handle_t *handle, 1603 u32 bits_wanted, 1604 u32 min_bits, 1605 u16 *bit_off, 1606 unsigned int *num_bits, 1607 u64 *bg_blkno) 1608 { 1609 int status; 1610 u16 victim, i; 1611 u16 bits_left = 0; 1612 u64 hint_blkno = ac->ac_last_group; 1613 struct ocfs2_chain_list *cl; 1614 struct ocfs2_dinode *fe; 1615 1616 mlog_entry_void(); 1617 1618 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 1619 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); 1620 BUG_ON(!ac->ac_bh); 1621 1622 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1623 1624 /* The bh was validated by the inode read during 1625 * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ 1626 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 1627 1628 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1629 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1630 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1631 "bits but only %u total.", 1632 (unsigned long long)le64_to_cpu(fe->i_blkno), 1633 le32_to_cpu(fe->id1.bitmap1.i_used), 1634 le32_to_cpu(fe->id1.bitmap1.i_total)); 1635 status = -EIO; 1636 goto bail; 1637 } 1638 1639 if (hint_blkno) { 1640 /* Attempt to short-circuit the usual search mechanism 1641 * by jumping straight to the most recently used 1642 * allocation group. This helps us mantain some 1643 * contiguousness across allocations. */ 1644 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1645 min_bits, bit_off, num_bits, 1646 hint_blkno, &bits_left); 1647 if (!status) { 1648 /* Be careful to update *bg_blkno here as the 1649 * caller is expecting it to be filled in, and 1650 * ocfs2_search_one_group() won't do that for 1651 * us. 
*/ 1652 *bg_blkno = hint_blkno; 1653 goto set_hint; 1654 } 1655 if (status < 0 && status != -ENOSPC) { 1656 mlog_errno(status); 1657 goto bail; 1658 } 1659 } 1660 1661 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1662 1663 victim = ocfs2_find_victim_chain(cl); 1664 ac->ac_chain = victim; 1665 ac->ac_allow_chain_relink = 1; 1666 1667 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1668 num_bits, bg_blkno, &bits_left); 1669 if (!status) 1670 goto set_hint; 1671 if (status < 0 && status != -ENOSPC) { 1672 mlog_errno(status); 1673 goto bail; 1674 } 1675 1676 mlog(0, "Search of victim chain %u came up with nothing, " 1677 "trying all chains now.\n", victim); 1678 1679 /* If we didn't pick a good victim, then just default to 1680 * searching each chain in order. Don't allow chain relinking 1681 * because we only calculate enough journal credits for one 1682 * relink per alloc. */ 1683 ac->ac_allow_chain_relink = 0; 1684 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { 1685 if (i == victim) 1686 continue; 1687 if (!cl->cl_recs[i].c_free) 1688 continue; 1689 1690 ac->ac_chain = i; 1691 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1692 bit_off, num_bits, bg_blkno, 1693 &bits_left); 1694 if (!status) 1695 break; 1696 if (status < 0 && status != -ENOSPC) { 1697 mlog_errno(status); 1698 goto bail; 1699 } 1700 } 1701 1702 set_hint: 1703 if (status != -ENOSPC) { 1704 /* If the next search of this group is not likely to 1705 * yield a suitable extent, then we reset the last 1706 * group hint so as to not waste a disk read */ 1707 if (bits_left < min_bits) 1708 ac->ac_last_group = 0; 1709 else 1710 ac->ac_last_group = *bg_blkno; 1711 } 1712 1713 bail: 1714 mlog_exit(status); 1715 return status; 1716 } 1717 1718 int ocfs2_claim_metadata(struct ocfs2_super *osb, 1719 handle_t *handle, 1720 struct ocfs2_alloc_context *ac, 1721 u32 bits_wanted, 1722 u16 *suballoc_bit_start, 1723 unsigned int *num_bits, 1724 u64 *blkno_start) 
1725 { 1726 int status; 1727 u64 bg_blkno; 1728 1729 BUG_ON(!ac); 1730 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1731 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1732 1733 status = ocfs2_claim_suballoc_bits(osb, 1734 ac, 1735 handle, 1736 bits_wanted, 1737 1, 1738 suballoc_bit_start, 1739 num_bits, 1740 &bg_blkno); 1741 if (status < 0) { 1742 mlog_errno(status); 1743 goto bail; 1744 } 1745 atomic_inc(&osb->alloc_stats.bg_allocs); 1746 1747 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 1748 ac->ac_bits_given += (*num_bits); 1749 status = 0; 1750 bail: 1751 mlog_exit(status); 1752 return status; 1753 } 1754 1755 static void ocfs2_init_inode_ac_group(struct inode *dir, 1756 struct buffer_head *parent_fe_bh, 1757 struct ocfs2_alloc_context *ac) 1758 { 1759 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1760 /* 1761 * Try to allocate inodes from some specific group. 1762 * 1763 * If the parent dir has recorded the last group used in allocation, 1764 * cool, use it. Otherwise if we try to allocate new inode from the 1765 * same slot the parent dir belongs to, use the same chunk. 1766 * 1767 * We are very careful here to avoid the mistake of setting 1768 * ac_last_group to a group descriptor from a different (unlocked) slot. 
1769 */ 1770 if (OCFS2_I(dir)->ip_last_used_group && 1771 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 1772 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 1773 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 1774 ac->ac_last_group = ocfs2_which_suballoc_group( 1775 le64_to_cpu(fe->i_blkno), 1776 le16_to_cpu(fe->i_suballoc_bit)); 1777 } 1778 1779 static inline void ocfs2_save_inode_ac_group(struct inode *dir, 1780 struct ocfs2_alloc_context *ac) 1781 { 1782 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; 1783 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 1784 } 1785 1786 int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1787 handle_t *handle, 1788 struct inode *dir, 1789 struct buffer_head *parent_fe_bh, 1790 struct ocfs2_alloc_context *ac, 1791 u16 *suballoc_bit, 1792 u64 *fe_blkno) 1793 { 1794 int status; 1795 unsigned int num_bits; 1796 u64 bg_blkno; 1797 1798 mlog_entry_void(); 1799 1800 BUG_ON(!ac); 1801 BUG_ON(ac->ac_bits_given != 0); 1802 BUG_ON(ac->ac_bits_wanted != 1); 1803 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1804 1805 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 1806 1807 status = ocfs2_claim_suballoc_bits(osb, 1808 ac, 1809 handle, 1810 1, 1811 1, 1812 suballoc_bit, 1813 &num_bits, 1814 &bg_blkno); 1815 if (status < 0) { 1816 mlog_errno(status); 1817 goto bail; 1818 } 1819 atomic_inc(&osb->alloc_stats.bg_allocs); 1820 1821 BUG_ON(num_bits != 1); 1822 1823 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1824 ac->ac_bits_given++; 1825 ocfs2_save_inode_ac_group(dir, ac); 1826 status = 0; 1827 bail: 1828 mlog_exit(status); 1829 return status; 1830 } 1831 1832 /* translate a group desc. blkno and it's bitmap offset into 1833 * disk cluster offset. 
*/ 1834 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 1835 u64 bg_blkno, 1836 u16 bg_bit_off) 1837 { 1838 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1839 u32 cluster = 0; 1840 1841 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1842 1843 if (bg_blkno != osb->first_cluster_group_blkno) 1844 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 1845 cluster += (u32) bg_bit_off; 1846 return cluster; 1847 } 1848 1849 /* given a cluster offset, calculate which block group it belongs to 1850 * and return that block offset. */ 1851 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) 1852 { 1853 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1854 u32 group_no; 1855 1856 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1857 1858 group_no = cluster / osb->bitmap_cpg; 1859 if (!group_no) 1860 return osb->first_cluster_group_blkno; 1861 return ocfs2_clusters_to_blocks(inode->i_sb, 1862 group_no * osb->bitmap_cpg); 1863 } 1864 1865 /* given the block number of a cluster start, calculate which cluster 1866 * group and descriptor bitmap offset that corresponds to. */ 1867 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 1868 u64 data_blkno, 1869 u64 *bg_blkno, 1870 u16 *bg_bit_off) 1871 { 1872 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1873 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); 1874 1875 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1876 1877 *bg_blkno = ocfs2_which_cluster_group(inode, 1878 data_cluster); 1879 1880 if (*bg_blkno == osb->first_cluster_group_blkno) 1881 *bg_bit_off = (u16) data_cluster; 1882 else 1883 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, 1884 data_blkno - *bg_blkno); 1885 } 1886 1887 /* 1888 * min_bits - minimum contiguous chunk from this total allocation we 1889 * can handle. set to what we asked for originally for a full 1890 * contig. allocation, set to '1' to indicate we can deal with extents 1891 * of any size. 
1892 */ 1893 int __ocfs2_claim_clusters(struct ocfs2_super *osb, 1894 handle_t *handle, 1895 struct ocfs2_alloc_context *ac, 1896 u32 min_clusters, 1897 u32 max_clusters, 1898 u32 *cluster_start, 1899 u32 *num_clusters) 1900 { 1901 int status; 1902 unsigned int bits_wanted = max_clusters; 1903 u64 bg_blkno = 0; 1904 u16 bg_bit_off; 1905 1906 mlog_entry_void(); 1907 1908 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 1909 1910 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 1911 && ac->ac_which != OCFS2_AC_USE_MAIN); 1912 1913 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 1914 status = ocfs2_claim_local_alloc_bits(osb, 1915 handle, 1916 ac, 1917 bits_wanted, 1918 cluster_start, 1919 num_clusters); 1920 if (!status) 1921 atomic_inc(&osb->alloc_stats.local_data); 1922 } else { 1923 if (min_clusters > (osb->bitmap_cpg - 1)) { 1924 /* The only paths asking for contiguousness 1925 * should know about this already. */ 1926 mlog(ML_ERROR, "minimum allocation requested %u exceeds " 1927 "group bitmap size %u!\n", min_clusters, 1928 osb->bitmap_cpg); 1929 status = -ENOSPC; 1930 goto bail; 1931 } 1932 /* clamp the current request down to a realistic size. 
*/ 1933 if (bits_wanted > (osb->bitmap_cpg - 1)) 1934 bits_wanted = osb->bitmap_cpg - 1; 1935 1936 status = ocfs2_claim_suballoc_bits(osb, 1937 ac, 1938 handle, 1939 bits_wanted, 1940 min_clusters, 1941 &bg_bit_off, 1942 num_clusters, 1943 &bg_blkno); 1944 if (!status) { 1945 *cluster_start = 1946 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 1947 bg_blkno, 1948 bg_bit_off); 1949 atomic_inc(&osb->alloc_stats.bitmap_data); 1950 } 1951 } 1952 if (status < 0) { 1953 if (status != -ENOSPC) 1954 mlog_errno(status); 1955 goto bail; 1956 } 1957 1958 ac->ac_bits_given += *num_clusters; 1959 1960 bail: 1961 mlog_exit(status); 1962 return status; 1963 } 1964 1965 int ocfs2_claim_clusters(struct ocfs2_super *osb, 1966 handle_t *handle, 1967 struct ocfs2_alloc_context *ac, 1968 u32 min_clusters, 1969 u32 *cluster_start, 1970 u32 *num_clusters) 1971 { 1972 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 1973 1974 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 1975 bits_wanted, cluster_start, num_clusters); 1976 } 1977 1978 static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1979 struct inode *alloc_inode, 1980 struct ocfs2_group_desc *bg, 1981 struct buffer_head *group_bh, 1982 unsigned int bit_off, 1983 unsigned int num_bits) 1984 { 1985 int status; 1986 unsigned int tmp; 1987 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1988 struct ocfs2_group_desc *undo_bg = NULL; 1989 int cluster_bitmap = 0; 1990 1991 mlog_entry_void(); 1992 1993 /* The caller got this descriptor from 1994 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 1995 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1996 1997 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1998 1999 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2000 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 2001 2002 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2003 group_bh, journal_type); 2004 if (status < 0) { 2005 mlog_errno(status); 2006 goto bail; 2007 } 2008 2009 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2010 cluster_bitmap = 1; 2011 2012 if (cluster_bitmap) { 2013 jbd_lock_bh_state(group_bh); 2014 undo_bg = (struct ocfs2_group_desc *) 2015 bh2jh(group_bh)->b_committed_data; 2016 BUG_ON(!undo_bg); 2017 } 2018 2019 tmp = num_bits; 2020 while(tmp--) { 2021 ocfs2_clear_bit((bit_off + tmp), 2022 (unsigned long *) bg->bg_bitmap); 2023 if (cluster_bitmap) 2024 ocfs2_set_bit(bit_off + tmp, 2025 (unsigned long *) undo_bg->bg_bitmap); 2026 } 2027 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2028 2029 if (cluster_bitmap) 2030 jbd_unlock_bh_state(group_bh); 2031 2032 status = ocfs2_journal_dirty(handle, group_bh); 2033 if (status < 0) 2034 mlog_errno(status); 2035 bail: 2036 return status; 2037 } 2038 2039 /* 2040 * expects the suballoc inode to already be locked. 2041 */ 2042 int ocfs2_free_suballoc_bits(handle_t *handle, 2043 struct inode *alloc_inode, 2044 struct buffer_head *alloc_bh, 2045 unsigned int start_bit, 2046 u64 bg_blkno, 2047 unsigned int count) 2048 { 2049 int status = 0; 2050 u32 tmp_used; 2051 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2052 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2053 struct buffer_head *group_bh = NULL; 2054 struct ocfs2_group_desc *group; 2055 2056 mlog_entry_void(); 2057 2058 /* The alloc_bh comes from ocfs2_free_dinode() or 2059 * ocfs2_free_clusters(). The callers have all locked the 2060 * allocator and gotten alloc_bh from the lock call. This 2061 * validates the dinode buffer. Any corruption that has happended 2062 * is a code bug. 
*/ 2063 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2064 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 2065 2066 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", 2067 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 2068 (unsigned long long)bg_blkno, start_bit); 2069 2070 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, 2071 &group_bh); 2072 if (status < 0) { 2073 mlog_errno(status); 2074 goto bail; 2075 } 2076 group = (struct ocfs2_group_desc *) group_bh->b_data; 2077 2078 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 2079 2080 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2081 group, group_bh, 2082 start_bit, count); 2083 if (status < 0) { 2084 mlog_errno(status); 2085 goto bail; 2086 } 2087 2088 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2089 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2090 if (status < 0) { 2091 mlog_errno(status); 2092 goto bail; 2093 } 2094 2095 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, 2096 count); 2097 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2098 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2099 2100 status = ocfs2_journal_dirty(handle, alloc_bh); 2101 if (status < 0) { 2102 mlog_errno(status); 2103 goto bail; 2104 } 2105 2106 bail: 2107 brelse(group_bh); 2108 2109 mlog_exit(status); 2110 return status; 2111 } 2112 2113 int ocfs2_free_dinode(handle_t *handle, 2114 struct inode *inode_alloc_inode, 2115 struct buffer_head *inode_alloc_bh, 2116 struct ocfs2_dinode *di) 2117 { 2118 u64 blk = le64_to_cpu(di->i_blkno); 2119 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2120 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2121 2122 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2123 inode_alloc_bh, bit, bg_blkno, 1); 2124 } 2125 2126 int ocfs2_free_clusters(handle_t *handle, 2127 struct inode *bitmap_inode, 2128 struct buffer_head *bitmap_bh, 2129 u64 start_blk, 2130 unsigned int num_clusters) 2131 { 2132 
int status; 2133 u16 bg_start_bit; 2134 u64 bg_blkno; 2135 struct ocfs2_dinode *fe; 2136 2137 /* You can't ever have a contiguous set of clusters 2138 * bigger than a block group bitmap so we never have to worry 2139 * about looping on them. */ 2140 2141 mlog_entry_void(); 2142 2143 /* This is expensive. We can safely remove once this stuff has 2144 * gotten tested really well. */ 2145 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); 2146 2147 fe = (struct ocfs2_dinode *) bitmap_bh->b_data; 2148 2149 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 2150 &bg_start_bit); 2151 2152 mlog(0, "want to free %u clusters starting at block %llu\n", 2153 num_clusters, (unsigned long long)start_blk); 2154 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2155 (unsigned long long)bg_blkno, bg_start_bit); 2156 2157 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2158 bg_start_bit, bg_blkno, 2159 num_clusters); 2160 if (status < 0) { 2161 mlog_errno(status); 2162 goto out; 2163 } 2164 2165 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), 2166 num_clusters); 2167 2168 out: 2169 mlog_exit(status); 2170 return status; 2171 } 2172 2173 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2174 { 2175 printk("Block Group:\n"); 2176 printk("bg_signature: %s\n", bg->bg_signature); 2177 printk("bg_size: %u\n", bg->bg_size); 2178 printk("bg_bits: %u\n", bg->bg_bits); 2179 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); 2180 printk("bg_chain: %u\n", bg->bg_chain); 2181 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); 2182 printk("bg_next_group: %llu\n", 2183 (unsigned long long)bg->bg_next_group); 2184 printk("bg_parent_dinode: %llu\n", 2185 (unsigned long long)bg->bg_parent_dinode); 2186 printk("bg_blkno: %llu\n", 2187 (unsigned long long)bg->bg_blkno); 2188 } 2189 2190 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode 
*fe) 2191 { 2192 int i; 2193 2194 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); 2195 printk("i_signature: %s\n", fe->i_signature); 2196 printk("i_size: %llu\n", 2197 (unsigned long long)fe->i_size); 2198 printk("i_clusters: %u\n", fe->i_clusters); 2199 printk("i_generation: %u\n", 2200 le32_to_cpu(fe->i_generation)); 2201 printk("id1.bitmap1.i_used: %u\n", 2202 le32_to_cpu(fe->id1.bitmap1.i_used)); 2203 printk("id1.bitmap1.i_total: %u\n", 2204 le32_to_cpu(fe->id1.bitmap1.i_total)); 2205 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); 2206 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); 2207 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); 2208 printk("id2.i_chain.cl_next_free_rec: %u\n", 2209 fe->id2.i_chain.cl_next_free_rec); 2210 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { 2211 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, 2212 fe->id2.i_chain.cl_recs[i].c_free); 2213 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, 2214 fe->id2.i_chain.cl_recs[i].c_total); 2215 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, 2216 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); 2217 } 2218 } 2219 2220 /* 2221 * For a given allocation, determine which allocators will need to be 2222 * accessed, and lock them, reserving the appropriate number of bits. 2223 * 2224 * Sparse file systems call this from ocfs2_write_begin_nolock() 2225 * and ocfs2_allocate_unwritten_extents(). 2226 * 2227 * File systems which don't support holes call this from 2228 * ocfs2_extend_allocation(). 
2229 */ 2230 int ocfs2_lock_allocators(struct inode *inode, 2231 struct ocfs2_extent_tree *et, 2232 u32 clusters_to_add, u32 extents_to_split, 2233 struct ocfs2_alloc_context **data_ac, 2234 struct ocfs2_alloc_context **meta_ac) 2235 { 2236 int ret = 0, num_free_extents; 2237 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; 2238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2239 2240 *meta_ac = NULL; 2241 if (data_ac) 2242 *data_ac = NULL; 2243 2244 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2245 2246 num_free_extents = ocfs2_num_free_extents(osb, et); 2247 if (num_free_extents < 0) { 2248 ret = num_free_extents; 2249 mlog_errno(ret); 2250 goto out; 2251 } 2252 2253 /* 2254 * Sparse allocation file systems need to be more conservative 2255 * with reserving room for expansion - the actual allocation 2256 * happens while we've got a journal handle open so re-taking 2257 * a cluster lock (because we ran out of room for another 2258 * extent) will violate ordering rules. 2259 * 2260 * Most of the time we'll only be seeing this 1 cluster at a time 2261 * anyway. 2262 * 2263 * Always lock for any unwritten extents - we might want to 2264 * add blocks during a split. 2265 */ 2266 if (!num_free_extents || 2267 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { 2268 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); 2269 if (ret < 0) { 2270 if (ret != -ENOSPC) 2271 mlog_errno(ret); 2272 goto out; 2273 } 2274 } 2275 2276 if (clusters_to_add == 0) 2277 goto out; 2278 2279 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 2280 if (ret < 0) { 2281 if (ret != -ENOSPC) 2282 mlog_errno(ret); 2283 goto out; 2284 } 2285 2286 out: 2287 if (ret) { 2288 if (*meta_ac) { 2289 ocfs2_free_alloc_context(*meta_ac); 2290 *meta_ac = NULL; 2291 } 2292 2293 /* 2294 * We cannot have an error and a non null *data_ac. 
2295 */ 2296 } 2297 2298 return ret; 2299 } 2300 2301 /* 2302 * Read the inode specified by blkno to get suballoc_slot and 2303 * suballoc_bit. 2304 */ 2305 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2306 u16 *suballoc_slot, u16 *suballoc_bit) 2307 { 2308 int status; 2309 struct buffer_head *inode_bh = NULL; 2310 struct ocfs2_dinode *inode_fe; 2311 2312 mlog_entry("blkno: %llu\n", (unsigned long long)blkno); 2313 2314 /* dirty read disk */ 2315 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 2316 if (status < 0) { 2317 mlog(ML_ERROR, "read block %llu failed %d\n", 2318 (unsigned long long)blkno, status); 2319 goto bail; 2320 } 2321 2322 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; 2323 if (!OCFS2_IS_VALID_DINODE(inode_fe)) { 2324 mlog(ML_ERROR, "invalid inode %llu requested\n", 2325 (unsigned long long)blkno); 2326 status = -EINVAL; 2327 goto bail; 2328 } 2329 2330 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && 2331 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { 2332 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", 2333 (unsigned long long)blkno, 2334 (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); 2335 status = -EINVAL; 2336 goto bail; 2337 } 2338 2339 if (suballoc_slot) 2340 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2341 if (suballoc_bit) 2342 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2343 2344 bail: 2345 brelse(inode_bh); 2346 2347 mlog_exit(status); 2348 return status; 2349 } 2350 2351 /* 2352 * test whether bit is SET in allocator bitmap or not. on success, 0 2353 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno 2354 * is returned and *res is meaningless. 
Call this after you have
 * cluster locked against suballoc, or you may get a result based on
 * non-up2date contents
 */
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
				   struct inode *suballoc,
				   struct buffer_head *alloc_bh, u64 blkno,
				   u16 bit, int *res)
{
	struct ocfs2_dinode *alloc_di =
		(struct ocfs2_dinode *)alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &alloc_di->id2.i_chain;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	u64 group_blkno;
	int status;

	mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
		   (unsigned int)bit);

	/* Reject a bit index that cannot exist in any group of this allocator. */
	if ((bit + 1) > ocfs2_bits_per_group(cl)) {
		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
		     (unsigned int)bit,
		     ocfs2_bits_per_group(cl));
		status = -EINVAL;
		goto bail;
	}

	/* Locate and read the group descriptor that owns this bit. */
	group_blkno = ocfs2_which_suballoc_group(blkno, bit);
	status = ocfs2_read_group_descriptor(suballoc, alloc_di, group_blkno,
					     &bg_bh);
	if (status < 0) {
		mlog(ML_ERROR, "read group %llu failed %d\n",
		     (unsigned long long)group_blkno, status);
		goto bail;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	*res = ocfs2_test_bit(bit, (unsigned long *)bg->bg_bitmap);

bail:
	brelse(bg_bh);

	mlog_exit(status);
	return status;
}

/*
 * Test if the bit representing this inode (blkno) is set in the
 * suballocator.
 *
 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
 *
 * In the event of failure, a negative value is returned and *res is
 * meaningless.
 *
 * Callers must make sure to hold nfs_sync_lock to prevent
 * ocfs2_delete_inode() on another node from accessing the same
 * suballocator concurrently.
2412 */ 2413 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2414 { 2415 int status; 2416 u16 suballoc_bit = 0, suballoc_slot = 0; 2417 struct inode *inode_alloc_inode; 2418 struct buffer_head *alloc_bh = NULL; 2419 2420 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2421 2422 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2423 &suballoc_bit); 2424 if (status < 0) { 2425 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2426 goto bail; 2427 } 2428 2429 inode_alloc_inode = 2430 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 2431 suballoc_slot); 2432 if (!inode_alloc_inode) { 2433 /* the error code could be inaccurate, but we are not able to 2434 * get the correct one. */ 2435 status = -EINVAL; 2436 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", 2437 (u32)suballoc_slot); 2438 goto bail; 2439 } 2440 2441 mutex_lock(&inode_alloc_inode->i_mutex); 2442 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2443 if (status < 0) { 2444 mutex_unlock(&inode_alloc_inode->i_mutex); 2445 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2446 (u32)suballoc_slot, status); 2447 goto bail; 2448 } 2449 2450 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2451 blkno, suballoc_bit, res); 2452 if (status < 0) 2453 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2454 2455 ocfs2_inode_unlock(inode_alloc_inode, 0); 2456 mutex_unlock(&inode_alloc_inode->i_mutex); 2457 2458 iput(inode_alloc_inode); 2459 brelse(alloc_bh); 2460 bail: 2461 mlog_exit(status); 2462 return status; 2463 } 2464