1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * suballoc.c 5 * 6 * metadata alloc and free 7 * Inspired by ext3 block groups. 8 * 9 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 10 * 11 * This program is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU General Public 13 * License as published by the Free Software Foundation; either 14 * version 2 of the License, or (at your option) any later version. 15 * 16 * This program is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public 22 * License along with this program; if not, write to the 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 24 * Boston, MA 021110-1307, USA. 25 */ 26 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33 #include <cluster/masklog.h> 34 35 #include "ocfs2.h" 36 37 #include "alloc.h" 38 #include "blockcheck.h" 39 #include "dlmglue.h" 40 #include "inode.h" 41 #include "journal.h" 42 #include "localalloc.h" 43 #include "suballoc.h" 44 #include "super.h" 45 #include "sysfile.h" 46 #include "uptodate.h" 47 48 #include "buffer_head_io.h" 49 50 #define NOT_ALLOC_NEW_GROUP 0 51 #define ALLOC_NEW_GROUP 0x1 52 #define ALLOC_GROUPS_FROM_GLOBAL 0x2 53 54 #define OCFS2_MAX_TO_STEAL 1024 55 56 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 57 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 58 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 59 static int ocfs2_block_group_fill(handle_t *handle, 60 struct inode *alloc_inode, 61 struct buffer_head *bg_bh, 62 u64 group_blkno, 63 u16 my_chain, 64 
				 struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      u16 *bit_off, u16 *bits_found);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    u16 *bit_off, u16 *bits_found);
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
				     struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     u16 *bit_off,
				     unsigned int *num_bits,
				     u64 *bg_blkno);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static inline int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);

/*
 * Drop everything an allocation context holds: the cluster lock and
 * i_mutex on the allocator inode, the inode reference, and the cached
 * allocator dinode buffer.  Freed fields are NULLed so the function is
 * safe to call again on the same context.  A local-alloc context
 * (OCFS2_AC_USE_LOCAL) never took the cluster lock, so it is not
 * dropped in that case.
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		mutex_unlock(&inode->i_mutex);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
}

/* Release the context's resources, then free the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}

/* Number of bits covered by one block group of this chain list:
 * clusters-per-group * bits-per-cluster. */
static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
{
	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}

/* During resize we only log the problem; otherwise a bad descriptor
 * marks the filesystem with an error via ocfs2_error(). */
#define do_error(fmt, ...)						\
	do{								\
		if (resize)						\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
		else							\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
	} while (0)

/*
 * Validate the fields of a group descriptor that can be checked
 * without its parent allocator inode: signature, self-referential
 * block number, fs generation, and the relationship between bg_bits,
 * bg_free_bits_count and bg_size.  Returns 0 or -EINVAL.
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
		return -EINVAL;
	}

	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno "
			 "of %llu",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
		return -EINVAL;
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid "
			 "fs_generation of #%u",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
		return -EINVAL;
	}

	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but "
			 "claims that %u are free",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
		return -EINVAL;
	}

	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but "
			 "max bitmap bits of %u",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
		return -EINVAL;
	}

	return 0;
}

/*
 * Validate a group descriptor against its owning chain allocator
 * inode @di: the parent-dinode back pointer, the maximum bits one
 * group of this allocator may have, and the chain index.  @resize
 * relaxes the chain check because a group being added during resize
 * may legitimately sit at bg_chain == cl_next_free_rec.
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	/* Both sides are little-endian on-disk values, so compare raw. */
	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent "
			 "pointer (%llu, expected %llu)",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
		return -EINVAL;
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
		return -EINVAL;
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
		return -EINVAL;
	}

	return 0;
}

#undef do_error

/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.
 We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}

/*
 * Validate callback for ocfs2_read_block(): check the metadata ecc
 * first, then the descriptor's self-contained fields.  Parent checks
 * need the owning dinode and are done separately by the caller.
 */
static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	mlog(0, "Validating group descriptor %llu\n",
	     (unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}

/*
 * Read and fully validate the group descriptor at @gd_blkno belonging
 * to allocator inode @inode / dinode @di.  On success *bh holds a
 * referenced, validated buffer; a caller-supplied *bh is reused.
 */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up.
	 */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}

/*
 * Initialize the on-disk descriptor for a brand-new block group at
 * @group_blkno and point it at the current head of chain @my_chain.
 * The buffer is journaled with JOURNAL_ACCESS_CREATE; the caller owns
 * the transaction and links the group into the chain list afterwards.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	mlog_entry_void();

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
			    "b_blocknr (%llu)",
			    (unsigned long long)group_blkno,
			    (unsigned long long) bg_bh->b_blocknr);
		status = -EIO;
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	status = ocfs2_journal_dirty(handle, bg_bh);
	if (status < 0)
		mlog_errno(status);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time.
*/ 380 381 bail: 382 mlog_exit(status); 383 return status; 384 } 385 386 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) 387 { 388 u16 curr, best; 389 390 best = curr = 0; 391 while (curr < le16_to_cpu(cl->cl_count)) { 392 if (le32_to_cpu(cl->cl_recs[best].c_total) > 393 le32_to_cpu(cl->cl_recs[curr].c_total)) 394 best = curr; 395 curr++; 396 } 397 return best; 398 } 399 400 /* 401 * We expect the block group allocator to already be locked. 402 */ 403 static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 404 struct inode *alloc_inode, 405 struct buffer_head *bh, 406 u64 max_block, 407 u64 *last_alloc_group, 408 int flags) 409 { 410 int status, credits; 411 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 412 struct ocfs2_chain_list *cl; 413 struct ocfs2_alloc_context *ac = NULL; 414 handle_t *handle = NULL; 415 u32 bit_off, num_bits; 416 u16 alloc_rec; 417 u64 bg_blkno; 418 struct buffer_head *bg_bh = NULL; 419 struct ocfs2_group_desc *bg; 420 421 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); 422 423 mlog_entry_void(); 424 425 cl = &fe->id2.i_chain; 426 status = ocfs2_reserve_clusters_with_limit(osb, 427 le16_to_cpu(cl->cl_cpg), 428 max_block, flags, &ac); 429 if (status < 0) { 430 if (status != -ENOSPC) 431 mlog_errno(status); 432 goto bail; 433 } 434 435 credits = ocfs2_calc_group_alloc_credits(osb->sb, 436 le16_to_cpu(cl->cl_cpg)); 437 handle = ocfs2_start_trans(osb, credits); 438 if (IS_ERR(handle)) { 439 status = PTR_ERR(handle); 440 handle = NULL; 441 mlog_errno(status); 442 goto bail; 443 } 444 445 if (last_alloc_group && *last_alloc_group != 0) { 446 mlog(0, "use old allocation group %llu for block group alloc\n", 447 (unsigned long long)*last_alloc_group); 448 ac->ac_last_group = *last_alloc_group; 449 } 450 status = ocfs2_claim_clusters(osb, 451 handle, 452 ac, 453 le16_to_cpu(cl->cl_cpg), 454 &bit_off, 455 &num_bits); 456 if (status < 0) { 457 if (status != -ENOSPC) 458 mlog_errno(status); 459 goto bail; 460 } 
461 462 alloc_rec = ocfs2_find_smallest_chain(cl); 463 464 /* setup the group */ 465 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 466 mlog(0, "new descriptor, record %u, at block %llu\n", 467 alloc_rec, (unsigned long long)bg_blkno); 468 469 bg_bh = sb_getblk(osb->sb, bg_blkno); 470 if (!bg_bh) { 471 status = -EIO; 472 mlog_errno(status); 473 goto bail; 474 } 475 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); 476 477 status = ocfs2_block_group_fill(handle, 478 alloc_inode, 479 bg_bh, 480 bg_blkno, 481 alloc_rec, 482 cl); 483 if (status < 0) { 484 mlog_errno(status); 485 goto bail; 486 } 487 488 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 489 490 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 491 bh, OCFS2_JOURNAL_ACCESS_WRITE); 492 if (status < 0) { 493 mlog_errno(status); 494 goto bail; 495 } 496 497 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 498 le16_to_cpu(bg->bg_free_bits_count)); 499 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 500 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 501 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 502 le16_add_cpu(&cl->cl_next_free_rec, 1); 503 504 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - 505 le16_to_cpu(bg->bg_free_bits_count)); 506 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 507 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 508 509 status = ocfs2_journal_dirty(handle, bh); 510 if (status < 0) { 511 mlog_errno(status); 512 goto bail; 513 } 514 515 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 516 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 517 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 518 le32_to_cpu(fe->i_clusters))); 519 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 520 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 521 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 522 523 status = 0; 524 525 /* save 
the new last alloc group so that the caller can cache it. */ 526 if (last_alloc_group) 527 *last_alloc_group = ac->ac_last_group; 528 529 bail: 530 if (handle) 531 ocfs2_commit_trans(osb, handle); 532 533 if (ac) 534 ocfs2_free_alloc_context(ac); 535 536 brelse(bg_bh); 537 538 mlog_exit(status); 539 return status; 540 } 541 542 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, 543 struct ocfs2_alloc_context *ac, 544 int type, 545 u32 slot, 546 u64 *last_alloc_group, 547 int flags) 548 { 549 int status; 550 u32 bits_wanted = ac->ac_bits_wanted; 551 struct inode *alloc_inode; 552 struct buffer_head *bh = NULL; 553 struct ocfs2_dinode *fe; 554 u32 free_bits; 555 556 mlog_entry_void(); 557 558 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); 559 if (!alloc_inode) { 560 mlog_errno(-EINVAL); 561 return -EINVAL; 562 } 563 564 mutex_lock(&alloc_inode->i_mutex); 565 566 status = ocfs2_inode_lock(alloc_inode, &bh, 1); 567 if (status < 0) { 568 mutex_unlock(&alloc_inode->i_mutex); 569 iput(alloc_inode); 570 571 mlog_errno(status); 572 return status; 573 } 574 575 ac->ac_inode = alloc_inode; 576 ac->ac_alloc_slot = slot; 577 578 fe = (struct ocfs2_dinode *) bh->b_data; 579 580 /* The bh was validated by the inode read inside 581 * ocfs2_inode_lock(). Any corruption is a code bug. 
	 */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
			    (unsigned long long)le64_to_cpu(fe->i_blkno));
		status = -EIO;
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	/* Not enough free bits: try to grow the allocator, unless it
	 * is the global cluster bitmap or the caller forbade it. */
	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
			     bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
			     "and we don't alloc a new group for it.\n",
			     slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	mlog_exit(status);
	return status;
}

/* Forget the remembered inode-steal slot and reset the stolen-inode
 * counter, so the next reservation tries our own slot first. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}

/* Same as above, for the extent (metadata) allocator steal slot. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}

/* Reset both the inode and metadata steal slots. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}

/* Remember @slot as the last slot we successfully stole a resource of
 * @type from; protected by osb_lock. */
static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		osb->s_inode_steal_slot = slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		osb->s_meta_steal_slot = slot;
	spin_unlock(&osb->osb_lock);
}

/* Read back the remembered steal slot for @type (or OCFS2_INVALID_SLOT). */
static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
{
	int slot = OCFS2_INVALID_SLOT;

	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		slot = osb->s_inode_steal_slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		slot = osb->s_meta_steal_slot;
	spin_unlock(&osb->osb_lock);

	return slot;
}

static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Walk every other slot's allocator of @type looking for free bits,
 * beginning at the slot we last stole from (or the one after ours),
 * and remember a successful slot for next time.  Returns -ENOSPC when
 * all other slots are exhausted.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours.
	 */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		if (slot == osb->max_slots)
			slot = 0;

		/* never steal from ourselves here */
		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		ocfs2_free_ac_resource(ac);
	}

	return status;
}

/* Steal one free inode bit from another slot's inode allocator. */
static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}

/* Steal metadata bits from another slot's extent allocator. */
static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Reserve @blocks metadata blocks, preferring this node's extent
 * allocator and falling back to stealing from other slots on -ENOSPC.
 * On success *ac holds the reservation; the caller frees it with
 * ocfs2_free_alloc_context().
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* Keep stealing from the remembered slot until we have stolen
	 * OCFS2_MAX_TO_STEAL times, then retry our own slot. */
	if (slot != OCFS2_INVALID_SLOT &&
		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		/* Our own slot worked again; stop stealing. */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	mlog_exit(status);
	return status;
}

/* Reserve enough metadata blocks for an extent tree operation rooted
 * at @root_el. */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}

/*
 * Reserve one inode bit, preferring this node's inode allocator and
 * stealing from other slots' allocators when ours is full.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Cache the group we allocated from for next time. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		mlog(0, "after reservation, new allocation group is "
		     "%llu\n", (unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	mlog_exit(status);
	return status;
}

/* local alloc code has to do the same thing, so rather than do this
 * twice..
 */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
				      struct ocfs2_alloc_context *ac)
{
	int status;

	/* Point the context at the global bitmap and its group-search
	 * routine, then lock and check it like any other allocator. */
	ac->ac_which = OCFS2_AC_USE_MAIN;
	ac->ac_group_search = ocfs2_cluster_group_search;

	status = ocfs2_reserve_suballoc_bits(osb, ac,
					     GLOBAL_BITMAP_SYSTEM_INODE,
					     OCFS2_INVALID_SLOT, NULL,
					     ALLOC_NEW_GROUP);
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

bail:
	return status;
}

/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit. */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status;

	mlog_entry_void();

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	/* Try the local alloc first when allowed; -ENOSPC means "fall
	 * through to the main bitmap" below. */
	status = -ENOSPC;
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if (status == -EFBIG) {
			/* The local alloc window is outside ac_max_block.
			 * use the main bitmap.
			 */
			status = -ENOSPC;
		} else if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (status == -ENOSPC) {
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	mlog_exit(status);
	return status;
}

/* Reserve @bits_wanted clusters with no block-number limit, allowing
 * new groups to be allocated. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}

/*
 * More or less lifted from ext3. I'll leave their description below:
 *
 * "For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes."
 *
 * Note: OCFS2 already does this differently for metadata vs data
 * allocations, as those bitmaps are separate and undo access is never
 * called on a metadata group descriptor.
 */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	int ret;

	/* Bit already set on the current copy: not allocatable. */
	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	/* Buffer is not part of a journal transaction, so the current
	 * copy is authoritative. */
	if (!buffer_jbd(bg_bh))
		return 1;

	/* Otherwise consult the last-committed copy: a bit clear now
	 * but set there was freed by a not-yet-committed transaction
	 * and must not be reused (see the comment above). */
	jbd_lock_bh_state(bg_bh);
	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
	if (bg)
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	jbd_unlock_bh_state(bg_bh);

	return ret;
}

/*
 * Scan the group bitmap for the best run of clear, allocatable bits.
 * Returns the longest run found (capped at @bits_wanted) through
 * *bit_off/*bits_found, or -ENOSPC when no allocatable bit exists.
 */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     u16 *bit_off,
					     u16 *bits_found)
{
	void *bitmap;
	u16 best_offset, best_size;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
		if (offset == total_bits)
			break;

		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
		}
		/* Track the longest usable run seen so far. */
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* XXX: I think the first clause is equivalent to the second
	 * - jlbec */
	if (found == bits_wanted) {
		*bit_off = start - found;
		*bits_found = found;
	} else if (best_size) {
		*bit_off = best_offset;
		*bits_found = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}

/*
 * Mark @num_bits bits starting at @bit_off as allocated in group
 * descriptor @bg and journal the change.  The cluster bitmap uses
 * undo access (see the ext3-derived comment above).
 */
static inline int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;

	mlog_entry_void();

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug.
	 */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
	     num_bits);

	/* Cluster bitmap updates get undo access so freed-but-not-yet-
	 * committed bits are not handed out again (see comment above
	 * ocfs2_test_bg_bit_allocatable). */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);

	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	status = ocfs2_journal_dirty(handle,
				     group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}

/* find the one with the most empty bits */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	BUG_ON(!cl->cl_next_free_rec);

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
		    le32_to_cpu(cl->cl_recs[best].c_free))
			best = curr;
		curr++;
	}

	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
	return best;
}

/*
 * Move group @bg_bh to the head of chain @chain, relinking
 * @prev_bg_bh around it.  All three buffers are journaled; the old
 * pointer values are saved first so the chain can be restored if a
 * journal call fails partway through.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case.
*/ 1177 u64 fe_ptr, bg_ptr, prev_bg_ptr; 1178 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1179 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1180 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1181 1182 /* The caller got these descriptors from 1183 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 1184 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1185 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); 1186 1187 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 1188 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1189 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1190 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1191 1192 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 1193 bg_ptr = le64_to_cpu(bg->bg_next_group); 1194 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1195 1196 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1197 prev_bg_bh, 1198 OCFS2_JOURNAL_ACCESS_WRITE); 1199 if (status < 0) { 1200 mlog_errno(status); 1201 goto out_rollback; 1202 } 1203 1204 prev_bg->bg_next_group = bg->bg_next_group; 1205 1206 status = ocfs2_journal_dirty(handle, prev_bg_bh); 1207 if (status < 0) { 1208 mlog_errno(status); 1209 goto out_rollback; 1210 } 1211 1212 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1213 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1214 if (status < 0) { 1215 mlog_errno(status); 1216 goto out_rollback; 1217 } 1218 1219 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1220 1221 status = ocfs2_journal_dirty(handle, bg_bh); 1222 if (status < 0) { 1223 mlog_errno(status); 1224 goto out_rollback; 1225 } 1226 1227 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1228 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1229 if (status < 0) { 1230 mlog_errno(status); 1231 goto out_rollback; 1232 } 1233 1234 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1235 1236 status 
= ocfs2_journal_dirty(handle, fe_bh); 1237 if (status < 0) { 1238 mlog_errno(status); 1239 goto out_rollback; 1240 } 1241 1242 status = 0; 1243 out_rollback: 1244 if (status < 0) { 1245 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1246 bg->bg_next_group = cpu_to_le64(bg_ptr); 1247 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1248 } 1249 1250 mlog_exit(status); 1251 return status; 1252 } 1253 1254 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 1255 u32 wanted) 1256 { 1257 return le16_to_cpu(bg->bg_free_bits_count) > wanted; 1258 } 1259 1260 /* return 0 on success, -ENOSPC to keep searching and any other < 0 1261 * value on error. */ 1262 static int ocfs2_cluster_group_search(struct inode *inode, 1263 struct buffer_head *group_bh, 1264 u32 bits_wanted, u32 min_bits, 1265 u64 max_block, 1266 u16 *bit_off, u16 *bits_found) 1267 { 1268 int search = -ENOSPC; 1269 int ret; 1270 u64 blkoff; 1271 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1273 u16 tmp_off, tmp_found; 1274 unsigned int max_bits, gd_cluster_off; 1275 1276 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1277 1278 if (gd->bg_free_bits_count) { 1279 max_bits = le16_to_cpu(gd->bg_bits); 1280 1281 /* Tail groups in cluster bitmaps which aren't cpg 1282 * aligned are prone to partial extention by a failed 1283 * fs resize. If the file system resize never got to 1284 * update the dinode cluster count, then we don't want 1285 * to trust any clusters past it, regardless of what 1286 * the group descriptor says. 
*/ 1287 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, 1288 le64_to_cpu(gd->bg_blkno)); 1289 if ((gd_cluster_off + max_bits) > 1290 OCFS2_I(inode)->ip_clusters) { 1291 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; 1292 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", 1293 (unsigned long long)le64_to_cpu(gd->bg_blkno), 1294 le16_to_cpu(gd->bg_bits), 1295 OCFS2_I(inode)->ip_clusters, max_bits); 1296 } 1297 1298 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1299 group_bh, bits_wanted, 1300 max_bits, 1301 &tmp_off, &tmp_found); 1302 if (ret) 1303 return ret; 1304 1305 if (max_block) { 1306 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1307 gd_cluster_off + 1308 tmp_off + tmp_found); 1309 mlog(0, "Checking %llu against %llu\n", 1310 (unsigned long long)blkoff, 1311 (unsigned long long)max_block); 1312 if (blkoff > max_block) 1313 return -ENOSPC; 1314 } 1315 1316 /* ocfs2_block_group_find_clear_bits() might 1317 * return success, but we still want to return 1318 * -ENOSPC unless it found the minimum number 1319 * of bits. */ 1320 if (min_bits <= tmp_found) { 1321 *bit_off = tmp_off; 1322 *bits_found = tmp_found; 1323 search = 0; /* success */ 1324 } else if (tmp_found) { 1325 /* 1326 * Don't show bits which we'll be returning 1327 * for allocation to the local alloc bitmap. 
1328 */ 1329 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1330 } 1331 } 1332 1333 return search; 1334 } 1335 1336 static int ocfs2_block_group_search(struct inode *inode, 1337 struct buffer_head *group_bh, 1338 u32 bits_wanted, u32 min_bits, 1339 u64 max_block, 1340 u16 *bit_off, u16 *bits_found) 1341 { 1342 int ret = -ENOSPC; 1343 u64 blkoff; 1344 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 1345 1346 BUG_ON(min_bits != 1); 1347 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 1348 1349 if (bg->bg_free_bits_count) { 1350 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1351 group_bh, bits_wanted, 1352 le16_to_cpu(bg->bg_bits), 1353 bit_off, bits_found); 1354 if (!ret && max_block) { 1355 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1356 *bits_found; 1357 mlog(0, "Checking %llu against %llu\n", 1358 (unsigned long long)blkoff, 1359 (unsigned long long)max_block); 1360 if (blkoff > max_block) 1361 ret = -ENOSPC; 1362 } 1363 } 1364 1365 return ret; 1366 } 1367 1368 static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1369 handle_t *handle, 1370 struct buffer_head *di_bh, 1371 u32 num_bits, 1372 u16 chain) 1373 { 1374 int ret; 1375 u32 tmp_used; 1376 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1377 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1378 1379 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 1380 OCFS2_JOURNAL_ACCESS_WRITE); 1381 if (ret < 0) { 1382 mlog_errno(ret); 1383 goto out; 1384 } 1385 1386 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1387 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1388 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1389 1390 ret = ocfs2_journal_dirty(handle, di_bh); 1391 if (ret < 0) 1392 mlog_errno(ret); 1393 1394 out: 1395 return ret; 1396 } 1397 1398 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1399 handle_t *handle, 1400 u32 bits_wanted, 1401 u32 min_bits, 1402 
u16 *bit_off, 1403 unsigned int *num_bits, 1404 u64 gd_blkno, 1405 u16 *bits_left) 1406 { 1407 int ret; 1408 u16 found; 1409 struct buffer_head *group_bh = NULL; 1410 struct ocfs2_group_desc *gd; 1411 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1412 struct inode *alloc_inode = ac->ac_inode; 1413 1414 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1415 &group_bh); 1416 if (ret < 0) { 1417 mlog_errno(ret); 1418 return ret; 1419 } 1420 1421 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1422 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1423 ac->ac_max_block, bit_off, &found); 1424 if (ret < 0) { 1425 if (ret != -ENOSPC) 1426 mlog_errno(ret); 1427 goto out; 1428 } 1429 1430 *num_bits = found; 1431 1432 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1433 *num_bits, 1434 le16_to_cpu(gd->bg_chain)); 1435 if (ret < 0) { 1436 mlog_errno(ret); 1437 goto out; 1438 } 1439 1440 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1441 *bit_off, *num_bits); 1442 if (ret < 0) 1443 mlog_errno(ret); 1444 1445 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1446 1447 out: 1448 brelse(group_bh); 1449 1450 return ret; 1451 } 1452 1453 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, 1454 handle_t *handle, 1455 u32 bits_wanted, 1456 u32 min_bits, 1457 u16 *bit_off, 1458 unsigned int *num_bits, 1459 u64 *bg_blkno, 1460 u16 *bits_left) 1461 { 1462 int status; 1463 u16 chain, tmp_bits; 1464 u32 tmp_used; 1465 u64 next_group; 1466 struct inode *alloc_inode = ac->ac_inode; 1467 struct buffer_head *group_bh = NULL; 1468 struct buffer_head *prev_group_bh = NULL; 1469 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1470 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1471 struct ocfs2_group_desc *bg; 1472 1473 chain = ac->ac_chain; 1474 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", 1475 bits_wanted, chain, 
1476 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1477 1478 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1479 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1480 &group_bh); 1481 if (status < 0) { 1482 mlog_errno(status); 1483 goto bail; 1484 } 1485 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1486 1487 status = -ENOSPC; 1488 /* for now, the chain search is a bit simplistic. We just use 1489 * the 1st group with any empty bits. */ 1490 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1491 bits_wanted, min_bits, 1492 ac->ac_max_block, bit_off, 1493 &tmp_bits)) == -ENOSPC) { 1494 if (!bg->bg_next_group) 1495 break; 1496 1497 brelse(prev_group_bh); 1498 prev_group_bh = NULL; 1499 1500 next_group = le64_to_cpu(bg->bg_next_group); 1501 prev_group_bh = group_bh; 1502 group_bh = NULL; 1503 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1504 next_group, &group_bh); 1505 if (status < 0) { 1506 mlog_errno(status); 1507 goto bail; 1508 } 1509 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1510 } 1511 if (status < 0) { 1512 if (status != -ENOSPC) 1513 mlog_errno(status); 1514 goto bail; 1515 } 1516 1517 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1518 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1519 1520 *num_bits = tmp_bits; 1521 1522 BUG_ON(*num_bits == 0); 1523 1524 /* 1525 * Keep track of previous block descriptor read. When 1526 * we find a target, if we have read more than X 1527 * number of descriptors, and the target is reasonably 1528 * empty, relink him to top of his chain. 1529 * 1530 * We've read 0 extra blocks and only send one more to 1531 * the transaction, yet the next guy to search has a 1532 * much easier time. 1533 * 1534 * Do this *after* figuring out how many bits we're taking out 1535 * of our target group. 
1536 */ 1537 if (ac->ac_allow_chain_relink && 1538 (prev_group_bh) && 1539 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1540 status = ocfs2_relink_block_group(handle, alloc_inode, 1541 ac->ac_bh, group_bh, 1542 prev_group_bh, chain); 1543 if (status < 0) { 1544 mlog_errno(status); 1545 goto bail; 1546 } 1547 } 1548 1549 /* Ok, claim our bits now: set the info on dinode, chainlist 1550 * and then the group */ 1551 status = ocfs2_journal_access_di(handle, 1552 INODE_CACHE(alloc_inode), 1553 ac->ac_bh, 1554 OCFS2_JOURNAL_ACCESS_WRITE); 1555 if (status < 0) { 1556 mlog_errno(status); 1557 goto bail; 1558 } 1559 1560 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1561 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1562 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1563 1564 status = ocfs2_journal_dirty(handle, 1565 ac->ac_bh); 1566 if (status < 0) { 1567 mlog_errno(status); 1568 goto bail; 1569 } 1570 1571 status = ocfs2_block_group_set_bits(handle, 1572 alloc_inode, 1573 bg, 1574 group_bh, 1575 *bit_off, 1576 *num_bits); 1577 if (status < 0) { 1578 mlog_errno(status); 1579 goto bail; 1580 } 1581 1582 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1583 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1584 1585 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1586 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1587 bail: 1588 brelse(group_bh); 1589 brelse(prev_group_bh); 1590 1591 mlog_exit(status); 1592 return status; 1593 } 1594 1595 /* will give out up to bits_wanted contiguous bits. 
*/
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
				     struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     u16 *bit_off,
				     unsigned int *num_bits,
				     u64 *bg_blkno)
{
	int status;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint_blkno = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
			    "bits but only %u total.",
			    (unsigned long long)le64_to_cpu(fe->i_blkno),
			    le32_to_cpu(fe->id1.bitmap1.i_used),
			    le32_to_cpu(fe->id1.bitmap1.i_total));
		status = -EIO;
		goto bail;
	}

	if (hint_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us mantain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, bit_off, num_bits,
						hint_blkno, &bits_left);
		if (!status) {
			/* Be careful to update *bg_blkno here as the
			 * caller is expecting it to be filled in, and
			 * ocfs2_search_one_group() won't do that for
			 * us. */
			*bg_blkno = hint_blkno;
			goto set_hint;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;

	/* Start with the chain that has the most free bits. */
	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;
	ac->ac_allow_chain_relink = 1;

	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
				    num_bits, bg_blkno, &bits_left);
	if (!status)
		goto set_hint;
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Search of victim chain %u came up with nothing, "
	     "trying all chains now.\n", victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_allow_chain_relink = 0;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		if (!cl->cl_recs[i].c_free)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    bit_off, num_bits, bg_blkno,
					    &bits_left);
		if (!status)
			break;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = *bg_blkno;
	}

bail:
	mlog_exit(status);
	return status;
}

/*
 * Claim bits_wanted contiguous bits from a metadata (extent block)
 * suballocator.  *blkno_start receives the disk block of the first
 * claimed bit; *num_bits may be less than bits_wanted.
 */
int ocfs2_claim_metadata(struct ocfs2_super *osb,
			 handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	u64 bg_blkno;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(osb,
					   ac,
					   handle,
					   bits_wanted,
					   1,
					   suballoc_bit_start,
					   num_bits,
					   &bg_blkno);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&osb->alloc_stats.bg_allocs);

	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
	ac->ac_bits_given += (*num_bits);
	status = 0;
bail:
	mlog_exit(status);
	return status;
}

static void ocfs2_init_inode_ac_group(struct inode *dir,
				      struct buffer_head *parent_fe_bh,
				      struct ocfs2_alloc_context *ac)
{
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
	/*
	 * Try to allocate inodes from some specific group.
	 *
	 * If the parent dir has recorded the last group used in allocation,
	 * cool, use it. Otherwise if we try to allocate new inode from the
	 * same slot the parent dir belongs to, use the same chunk.
	 *
	 * We are very careful here to avoid the mistake of setting
	 * ac_last_group to a group descriptor from a different (unlocked) slot.
	 */
	if (OCFS2_I(dir)->ip_last_used_group &&
	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
	else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
		ac->ac_last_group = ocfs2_which_suballoc_group(
					le64_to_cpu(fe->i_blkno),
					le16_to_cpu(fe->i_suballoc_bit));
}

/* Remember the group we just allocated from for the next inode alloc
 * under this directory. */
static inline void ocfs2_save_inode_ac_group(struct inode *dir,
					     struct ocfs2_alloc_context *ac)
{
	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}

/*
 * Claim exactly one bit from an inode suballocator for a new inode.
 * *fe_blkno receives the disk block that will hold the new dinode.
 */
int ocfs2_claim_new_inode(struct ocfs2_super *osb,
			  handle_t *handle,
			  struct inode *dir,
			  struct buffer_head *parent_fe_bh,
			  struct ocfs2_alloc_context *ac,
			  u16 *suballoc_bit,
			  u64 *fe_blkno)
{
	int status;
	unsigned int num_bits;
	u64 bg_blkno;

	mlog_entry_void();

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	status = ocfs2_claim_suballoc_bits(osb,
					   ac,
					   handle,
					   1,
					   1,
					   suballoc_bit,
					   &num_bits,
					   &bg_blkno);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&osb->alloc_stats.bg_allocs);

	BUG_ON(num_bits != 1);

	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);
	status = 0;
bail:
	mlog_exit(status);
	return status;
}

/* translate a group desc. blkno and its bitmap offset into
 * disk cluster offset.
*/ 1830 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 1831 u64 bg_blkno, 1832 u16 bg_bit_off) 1833 { 1834 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1835 u32 cluster = 0; 1836 1837 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1838 1839 if (bg_blkno != osb->first_cluster_group_blkno) 1840 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 1841 cluster += (u32) bg_bit_off; 1842 return cluster; 1843 } 1844 1845 /* given a cluster offset, calculate which block group it belongs to 1846 * and return that block offset. */ 1847 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) 1848 { 1849 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1850 u32 group_no; 1851 1852 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1853 1854 group_no = cluster / osb->bitmap_cpg; 1855 if (!group_no) 1856 return osb->first_cluster_group_blkno; 1857 return ocfs2_clusters_to_blocks(inode->i_sb, 1858 group_no * osb->bitmap_cpg); 1859 } 1860 1861 /* given the block number of a cluster start, calculate which cluster 1862 * group and descriptor bitmap offset that corresponds to. */ 1863 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 1864 u64 data_blkno, 1865 u64 *bg_blkno, 1866 u16 *bg_bit_off) 1867 { 1868 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1869 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); 1870 1871 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1872 1873 *bg_blkno = ocfs2_which_cluster_group(inode, 1874 data_cluster); 1875 1876 if (*bg_blkno == osb->first_cluster_group_blkno) 1877 *bg_bit_off = (u16) data_cluster; 1878 else 1879 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, 1880 data_blkno - *bg_blkno); 1881 } 1882 1883 /* 1884 * min_bits - minimum contiguous chunk from this total allocation we 1885 * can handle. set to what we asked for originally for a full 1886 * contig. allocation, set to '1' to indicate we can deal with extents 1887 * of any size. 
*/
int __ocfs2_claim_clusters(struct ocfs2_super *osb,
			   handle_t *handle,
			   struct ocfs2_alloc_context *ac,
			   u32 min_clusters,
			   u32 max_clusters,
			   u32 *cluster_start,
			   u32 *num_clusters)
{
	int status;
	unsigned int bits_wanted = max_clusters;
	u64 bg_blkno = 0;
	u16 bg_bit_off;

	mlog_entry_void();

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);

	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
	       && ac->ac_which != OCFS2_AC_USE_MAIN);

	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
		/* Fast path: carve the clusters out of this node's
		 * local alloc window. */
		status = ocfs2_claim_local_alloc_bits(osb,
						      handle,
						      ac,
						      bits_wanted,
						      cluster_start,
						      num_clusters);
		if (!status)
			atomic_inc(&osb->alloc_stats.local_data);
	} else {
		if (min_clusters > (osb->bitmap_cpg - 1)) {
			/* The only paths asking for contiguousness
			 * should know about this already. */
			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
			     "group bitmap size %u!\n", min_clusters,
			     osb->bitmap_cpg);
			status = -ENOSPC;
			goto bail;
		}
		/* clamp the current request down to a realistic size. */
		if (bits_wanted > (osb->bitmap_cpg - 1))
			bits_wanted = osb->bitmap_cpg - 1;

		status = ocfs2_claim_suballoc_bits(osb,
						   ac,
						   handle,
						   bits_wanted,
						   min_clusters,
						   &bg_bit_off,
						   num_clusters,
						   &bg_blkno);
		if (!status) {
			*cluster_start =
				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
								 bg_blkno,
								 bg_bit_off);
			atomic_inc(&osb->alloc_stats.bitmap_data);
		}
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	ac->ac_bits_given += *num_clusters;

bail:
	mlog_exit(status);
	return status;
}

/* Convenience wrapper: ask for all the bits still outstanding on ac. */
int ocfs2_claim_clusters(struct ocfs2_super *osb,
			 handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 min_clusters,
			 u32 *cluster_start,
			 u32 *num_clusters)
{
	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;

	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
				      bits_wanted, cluster_start, num_clusters);
}

/*
 * Clear num_bits bits starting at bit_off in the group bitmap and
 * bump bg_free_bits_count.  When undo_fn is non-NULL (cluster
 * bitmaps only), UNDO journal access is used and undo_fn is applied
 * to the journal's b_committed_data copy of the bitmap as well.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	struct ocfs2_group_desc *undo_bg = NULL;

	mlog_entry_void();

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);

	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (undo_fn) {
		/* b_committed_data is only valid while the bh state
		 * lock is held. */
		jbd_lock_bh_state(group_bh);
		undo_bg = (struct ocfs2_group_desc *)
					bh2jh(group_bh)->b_committed_data;
		BUG_ON(!undo_bg);
	}

	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);

	if (undo_fn)
		jbd_unlock_bh_state(group_bh);

	status = ocfs2_journal_dirty(handle, group_bh);
	if (status < 0)
		mlog_errno(status);
bail:
	return status;
}

/*
 * expects the suballoc inode to already be locked.
 */
static int _ocfs2_free_suballoc_bits(handle_t *handle,
				     struct inode *alloc_inode,
				     struct buffer_head *alloc_bh,
				     unsigned int start_bit,
				     u64 bg_blkno,
				     unsigned int count,
				     void (*undo_fn)(unsigned int bit,
						     unsigned long *bitmap))
{
	int status = 0;
	u32 tmp_used;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *group;

	mlog_entry_void();

	/* The alloc_bh comes from ocfs2_free_dinode() or
	 * ocfs2_free_clusters().  The callers have all locked the
	 * allocator and gotten alloc_bh from the lock call.  This
	 * validates the dinode buffer.  Any corruption that has happended
	 * is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));

	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
	     (unsigned long long)bg_blkno, start_bit);

	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));

	/* Clear the group bitmap first, then fix up the dinode counts. */
	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
					      group, group_bh,
					      start_bit, count, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
		     count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);

	status = ocfs2_journal_dirty(handle, alloc_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	brelse(group_bh);

	mlog_exit(status);
	return status;
}

/* Free suballocator bits with no undo protection. */
int ocfs2_free_suballoc_bits(handle_t *handle,
			     struct inode *alloc_inode,
			     struct buffer_head *alloc_bh,
			     unsigned int start_bit,
			     u64 bg_blkno,
			     unsigned int count)
{
	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
					 start_bit, bg_blkno, count, NULL);
}

/* Return the dinode's bit to its inode suballocator. */
int ocfs2_free_dinode(handle_t *handle,
		      struct inode *inode_alloc_inode,
		      struct buffer_head *inode_alloc_bh,
		      struct ocfs2_dinode *di)
{
	u64 blk = le64_to_cpu(di->i_blkno);
	u16 bit = le16_to_cpu(di->i_suballoc_bit);
	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);

	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
					inode_alloc_bh, bit, bg_blkno, 1);
}

static int _ocfs2_free_clusters(handle_t *handle,
				struct inode *bitmap_inode,
				struct buffer_head *bitmap_bh,
				u64 start_blk,
				unsigned int num_clusters,
				void (*undo_fn)(unsigned int bit,
						unsigned long *bitmap))
{
	int status;
	u16 bg_start_bit;
	u64 bg_blkno;
	struct ocfs2_dinode *fe;

	/* You can't ever have a contiguous set of clusters
	 * bigger than a block group bitmap so we never have to worry
	 * about looping on them. */

	mlog_entry_void();

	/* This is expensive. We can safely remove once this stuff has
	 * gotten tested really well. */
	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
							 start_blk)));

	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;

	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
				     &bg_start_bit);

	mlog(0, "want to free %u clusters starting at block %llu\n",
	     num_clusters, (unsigned long long)start_blk);
	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
	     (unsigned long long)bg_blkno, bg_start_bit);

	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
					   bg_start_bit, bg_blkno,
					   num_clusters, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* Let the local alloc code know free space reappeared. */
	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
					 num_clusters);

out:
	mlog_exit(status);
	return status;
}

/* Free previously-used clusters.  The undo_fn re-sets the bits in the
 * journal's committed copy so the space can't be reused until the
 * transaction commits. */
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_set_bit);
}

/*
 *
Give never-used clusters back to the global bitmap. We don't need 2195 * to protect these bits in the undo buffer. 2196 */ 2197 int ocfs2_release_clusters(handle_t *handle, 2198 struct inode *bitmap_inode, 2199 struct buffer_head *bitmap_bh, 2200 u64 start_blk, 2201 unsigned int num_clusters) 2202 { 2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 2204 start_blk, num_clusters, 2205 _ocfs2_clear_bit); 2206 } 2207 2208 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2209 { 2210 printk("Block Group:\n"); 2211 printk("bg_signature: %s\n", bg->bg_signature); 2212 printk("bg_size: %u\n", bg->bg_size); 2213 printk("bg_bits: %u\n", bg->bg_bits); 2214 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); 2215 printk("bg_chain: %u\n", bg->bg_chain); 2216 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); 2217 printk("bg_next_group: %llu\n", 2218 (unsigned long long)bg->bg_next_group); 2219 printk("bg_parent_dinode: %llu\n", 2220 (unsigned long long)bg->bg_parent_dinode); 2221 printk("bg_blkno: %llu\n", 2222 (unsigned long long)bg->bg_blkno); 2223 } 2224 2225 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) 2226 { 2227 int i; 2228 2229 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); 2230 printk("i_signature: %s\n", fe->i_signature); 2231 printk("i_size: %llu\n", 2232 (unsigned long long)fe->i_size); 2233 printk("i_clusters: %u\n", fe->i_clusters); 2234 printk("i_generation: %u\n", 2235 le32_to_cpu(fe->i_generation)); 2236 printk("id1.bitmap1.i_used: %u\n", 2237 le32_to_cpu(fe->id1.bitmap1.i_used)); 2238 printk("id1.bitmap1.i_total: %u\n", 2239 le32_to_cpu(fe->id1.bitmap1.i_total)); 2240 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); 2241 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); 2242 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); 2243 printk("id2.i_chain.cl_next_free_rec: %u\n", 2244 fe->id2.i_chain.cl_next_free_rec); 2245 for(i = 0; i < 
fe->id2.i_chain.cl_next_free_rec; i++) { 2246 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, 2247 fe->id2.i_chain.cl_recs[i].c_free); 2248 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, 2249 fe->id2.i_chain.cl_recs[i].c_total); 2250 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, 2251 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); 2252 } 2253 } 2254 2255 /* 2256 * For a given allocation, determine which allocators will need to be 2257 * accessed, and lock them, reserving the appropriate number of bits. 2258 * 2259 * Sparse file systems call this from ocfs2_write_begin_nolock() 2260 * and ocfs2_allocate_unwritten_extents(). 2261 * 2262 * File systems which don't support holes call this from 2263 * ocfs2_extend_allocation(). 2264 */ 2265 int ocfs2_lock_allocators(struct inode *inode, 2266 struct ocfs2_extent_tree *et, 2267 u32 clusters_to_add, u32 extents_to_split, 2268 struct ocfs2_alloc_context **data_ac, 2269 struct ocfs2_alloc_context **meta_ac) 2270 { 2271 int ret = 0, num_free_extents; 2272 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; 2273 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2274 2275 *meta_ac = NULL; 2276 if (data_ac) 2277 *data_ac = NULL; 2278 2279 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2280 2281 num_free_extents = ocfs2_num_free_extents(osb, et); 2282 if (num_free_extents < 0) { 2283 ret = num_free_extents; 2284 mlog_errno(ret); 2285 goto out; 2286 } 2287 2288 /* 2289 * Sparse allocation file systems need to be more conservative 2290 * with reserving room for expansion - the actual allocation 2291 * happens while we've got a journal handle open so re-taking 2292 * a cluster lock (because we ran out of room for another 2293 * extent) will violate ordering rules. 2294 * 2295 * Most of the time we'll only be seeing this 1 cluster at a time 2296 * anyway. 2297 * 2298 * Always lock for any unwritten extents - we might want to 2299 * add blocks during a split. 
2300 */ 2301 if (!num_free_extents || 2302 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { 2303 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); 2304 if (ret < 0) { 2305 if (ret != -ENOSPC) 2306 mlog_errno(ret); 2307 goto out; 2308 } 2309 } 2310 2311 if (clusters_to_add == 0) 2312 goto out; 2313 2314 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 2315 if (ret < 0) { 2316 if (ret != -ENOSPC) 2317 mlog_errno(ret); 2318 goto out; 2319 } 2320 2321 out: 2322 if (ret) { 2323 if (*meta_ac) { 2324 ocfs2_free_alloc_context(*meta_ac); 2325 *meta_ac = NULL; 2326 } 2327 2328 /* 2329 * We cannot have an error and a non null *data_ac. 2330 */ 2331 } 2332 2333 return ret; 2334 } 2335 2336 /* 2337 * Read the inode specified by blkno to get suballoc_slot and 2338 * suballoc_bit. 2339 */ 2340 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2341 u16 *suballoc_slot, u16 *suballoc_bit) 2342 { 2343 int status; 2344 struct buffer_head *inode_bh = NULL; 2345 struct ocfs2_dinode *inode_fe; 2346 2347 mlog_entry("blkno: %llu\n", (unsigned long long)blkno); 2348 2349 /* dirty read disk */ 2350 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 2351 if (status < 0) { 2352 mlog(ML_ERROR, "read block %llu failed %d\n", 2353 (unsigned long long)blkno, status); 2354 goto bail; 2355 } 2356 2357 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; 2358 if (!OCFS2_IS_VALID_DINODE(inode_fe)) { 2359 mlog(ML_ERROR, "invalid inode %llu requested\n", 2360 (unsigned long long)blkno); 2361 status = -EINVAL; 2362 goto bail; 2363 } 2364 2365 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && 2366 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { 2367 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", 2368 (unsigned long long)blkno, 2369 (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); 2370 status = -EINVAL; 2371 goto bail; 2372 } 2373 2374 if (suballoc_slot) 2375 *suballoc_slot = 
le16_to_cpu(inode_fe->i_suballoc_slot); 2376 if (suballoc_bit) 2377 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2378 2379 bail: 2380 brelse(inode_bh); 2381 2382 mlog_exit(status); 2383 return status; 2384 } 2385 2386 /* 2387 * test whether bit is SET in allocator bitmap or not. on success, 0 2388 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno 2389 * is returned and *res is meaningless. Call this after you have 2390 * cluster locked against suballoc, or you may get a result based on 2391 * non-up2date contents 2392 */ 2393 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2394 struct inode *suballoc, 2395 struct buffer_head *alloc_bh, u64 blkno, 2396 u16 bit, int *res) 2397 { 2398 struct ocfs2_dinode *alloc_fe; 2399 struct ocfs2_group_desc *group; 2400 struct buffer_head *group_bh = NULL; 2401 u64 bg_blkno; 2402 int status; 2403 2404 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2405 (unsigned int)bit); 2406 2407 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2408 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2409 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2410 (unsigned int)bit, 2411 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2412 status = -EINVAL; 2413 goto bail; 2414 } 2415 2416 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2417 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2418 &group_bh); 2419 if (status < 0) { 2420 mlog(ML_ERROR, "read group %llu failed %d\n", 2421 (unsigned long long)bg_blkno, status); 2422 goto bail; 2423 } 2424 2425 group = (struct ocfs2_group_desc *) group_bh->b_data; 2426 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); 2427 2428 bail: 2429 brelse(group_bh); 2430 2431 mlog_exit(status); 2432 return status; 2433 } 2434 2435 /* 2436 * Test if the bit representing this inode (blkno) is set in the 2437 * suballocator. 2438 * 2439 * On success, 0 is returned and *res is 1 for SET; 0 otherwise. 
2440 * 2441 * In the event of failure, a negative value is returned and *res is 2442 * meaningless. 2443 * 2444 * Callers must make sure to hold nfs_sync_lock to prevent 2445 * ocfs2_delete_inode() on another node from accessing the same 2446 * suballocator concurrently. 2447 */ 2448 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2449 { 2450 int status; 2451 u16 suballoc_bit = 0, suballoc_slot = 0; 2452 struct inode *inode_alloc_inode; 2453 struct buffer_head *alloc_bh = NULL; 2454 2455 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2456 2457 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2458 &suballoc_bit); 2459 if (status < 0) { 2460 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2461 goto bail; 2462 } 2463 2464 inode_alloc_inode = 2465 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 2466 suballoc_slot); 2467 if (!inode_alloc_inode) { 2468 /* the error code could be inaccurate, but we are not able to 2469 * get the correct one. */ 2470 status = -EINVAL; 2471 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", 2472 (u32)suballoc_slot); 2473 goto bail; 2474 } 2475 2476 mutex_lock(&inode_alloc_inode->i_mutex); 2477 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2478 if (status < 0) { 2479 mutex_unlock(&inode_alloc_inode->i_mutex); 2480 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2481 (u32)suballoc_slot, status); 2482 goto bail; 2483 } 2484 2485 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2486 blkno, suballoc_bit, res); 2487 if (status < 0) 2488 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2489 2490 ocfs2_inode_unlock(inode_alloc_inode, 0); 2491 mutex_unlock(&inode_alloc_inode->i_mutex); 2492 2493 iput(inode_alloc_inode); 2494 brelse(alloc_bh); 2495 bail: 2496 mlog_exit(status); 2497 return status; 2498 } 2499