// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * suballoc.c
 *
 * metadata alloc and free
 * Inspired by ext3 block groups.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/highmem.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

/* Flags controlling whether a reservation may grow the suballocator by
 * adding new block groups, and where those groups may come from. */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/* Upper bound on consecutive inode/metadata steals from other slots
 * before we re-check our own slot's allocator. */
#define OCFS2_MAX_TO_STEAL		1024

/* Result of one claim from a suballocator group. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};

/*
 * Return the block number of the group descriptor a result came from,
 * or 0 if nothing was allocated.  For contiguous groups (sr_bg_blkno
 * is 0) the group is derived from the allocated block itself.
 */
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
{
	if (res->sr_blkno == 0)
		return 0;

	if (res->sr_bg_blkno)
		return res->sr_bg_blkno;

	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}

static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);

/*
 * Release everything held by an allocation context: the cluster lock
 * and reference on the allocator inode, the descriptor buffer, and
 * any find_loc private data.  Does not free the context itself.
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		/* Local alloc reservations never took the cluster lock. */
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}

/* Release an allocation context's resources and free the context. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}

/* Total bitmap bits per block group: clusters-per-group * bits-per-cluster. */
static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
{
	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}

/*
 * Validation helper: during resize we only log corruption; otherwise
 * we fail the filesystem via ocfs2_error() and return from the caller.
 * Requires 'resize' and 'sb' in the calling scope.
 */
#define do_error(fmt, ...)						\
	do {								\
		if (resize)						\
			mlog(ML_ERROR, fmt, ##__VA_ARGS__);		\
		else							\
			return ocfs2_error(sb, fmt, ##__VA_ARGS__);	\
	} while (0)

/* Validate the fields of a group descriptor that are self-contained
 * (signature, block number, generation, bit counts). */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}

/* Validate the fields of a group descriptor that depend on the owning
 * chain allocator dinode (parent pointer, bit count limit, chain index). */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	      le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}

#undef do_error

/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}

/* Read-time validator passed to ocfs2_read_block(); failures here are
 * fatal (resize == 0). */
static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	trace_ocfs2_validate_group_descriptor(
					(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}

/*
 * Read and fully validate the group descriptor at gd_blkno.  On
 * success *bh holds the (possibly newly read) buffer; caller owns the
 * reference.
 */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}

/*
 * Append a physical extent to a discontiguous block group's embedded
 * extent list and account its bits as present and free.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent: initialize the list's capacity. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* Logical offset is the number of bits already in the group. */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}

/*
 * Initialize a freshly allocated group descriptor block and link it at
 * the head of chain 'my_chain'.  Journaled with ACCESS_CREATE.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
							  osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Return the index of the chain with the fewest total bits, so new
 * groups are added where they balance the chains. */
static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_count)) {
		if (le32_to_cpu(cl->cl_recs[best].c_total) >
		    le32_to_cpu(cl->cl_recs[curr].c_total))
			best = curr;
		curr++;
	}
	return best;
}

/*
 * Allocate one fully contiguous block group (cl_cpg clusters) from the
 * cluster bitmap and initialize its descriptor.  Returns the group's
 * buffer_head or an ERR_PTR.
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * Claim up to min_bits clusters, halving the request on each -ENOSPC
 * until something (possibly smaller) is found or min_bits hits 0.
 */
static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	while (min_bits) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;

		min_bits >>= 1;
	}

	return status;
}

/*
 * Grow a partially-filled discontiguous block group by claiming more
 * cluster extents until it reaches cl_cpg clusters or the extent list
 * is full (in which case -ENOSPC).
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg.  So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}

/*
 * Undo a failed discontiguous group allocation: free every cluster
 * extent recorded in the group and drop the buffer from cache.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}

/*
 * Allocate a block group from multiple smaller cluster extents when a
 * contiguous allocation failed.  Returns the group's buffer_head or an
 * ERR_PTR; on error all claimed clusters are freed again.
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * We expect the block group allocator to already be locked.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* Only chain allocators grow this way; the cluster bitmap never does. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Seed the search with the caller's cached hint, if any. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	/* Try a contiguous group first, fall back to discontiguous. */
	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Account the new group in its chain and in the dinode totals. */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Lock the suballocator inode for 'type'/'slot' and make sure it has
 * at least ac->ac_bits_wanted free bits, growing it with a new block
 * group if 'flags' allow.  On success ac holds the inode, slot and
 * descriptor buffer.
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* Reset the inode steal slot and its steal counter. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}

/* Reset the metadata steal slot and its steal counter. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}

void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}

/* Remember which slot we last stole 'type' resources from. */
static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		osb->s_inode_steal_slot = (u16)slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		osb->s_meta_steal_slot = (u16)slot;
	spin_unlock(&osb->osb_lock);
}

/* Fetch the cached steal slot for 'type' (OCFS2_INVALID_SLOT if none). */
static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
{
	int slot = OCFS2_INVALID_SLOT;

	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		slot = osb->s_inode_steal_slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		slot = osb->s_meta_steal_slot;
	spin_unlock(&osb->osb_lock);

	return slot;
}

static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Try to reserve bits from other slots' allocators, walking every slot
 * except our own, without growing them (NOT_ALLOC_NEW_GROUP).  Caches
 * the successful slot for next time.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		ocfs2_free_ac_resource(ac);
	}

	return status;
}

static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Reserve 'blocks' metadata blocks, preferring our own extent
 * allocator and falling back to stealing from other slots.  On success
 * *ac is the new context; on failure it is freed and NULLed.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* Keep stealing while a steal slot is cached and under the cap. */
	if (slot != OCFS2_INVALID_SLOT &&
		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}

/*
 * Reserve one inode, preferring our own inode allocator and falling
 * back to stealing.  Also maintains the cached inode alloc group hint.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* local alloc code has to do the same thing, so rather than do this
 * twice..
 */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
				      struct ocfs2_alloc_context *ac)
{
	int status;

	ac->ac_which = OCFS2_AC_USE_MAIN;
	ac->ac_group_search = ocfs2_cluster_group_search;

	status = ocfs2_reserve_suballoc_bits(osb, ac,
					     GLOBAL_BITMAP_SYSTEM_INODE,
					     OCFS2_INVALID_SLOT, NULL,
					     ALLOC_NEW_GROUP);
	if (status < 0 && status != -ENOSPC)
		mlog_errno(status);

	return status;
}

/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit. */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	/* Try the node-local allocator first, unless the caller asked
	 * for global groups or the request is too big for it. */
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/* Drop locks so the truncate log can be flushed,
			 * then retake them (or retry from scratch). */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}

/*
 * More or less lifted from ext3. I'll leave their description below:
 *
 * "For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes."
 *
 * Note: OCFS2 already does this differently for metadata vs data
 * allocations, as those bitmaps are separate and undo access is never
 * called on a metadata group descriptor.
1255 */ 1256 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 1257 int nr) 1258 { 1259 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1260 struct journal_head *jh; 1261 int ret; 1262 1263 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) 1264 return 0; 1265 1266 jh = jbd2_journal_grab_journal_head(bg_bh); 1267 if (!jh) 1268 return 1; 1269 1270 spin_lock(&jh->b_state_lock); 1271 bg = (struct ocfs2_group_desc *) jh->b_committed_data; 1272 if (bg) 1273 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); 1274 else 1275 ret = 1; 1276 spin_unlock(&jh->b_state_lock); 1277 jbd2_journal_put_journal_head(jh); 1278 1279 return ret; 1280 } 1281 1282 u16 ocfs2_find_max_contig_free_bits(void *bitmap, 1283 u16 total_bits, u16 start) 1284 { 1285 u16 offset, free_bits; 1286 u16 contig_bits = 0; 1287 1288 while (start < total_bits) { 1289 offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); 1290 if (offset == total_bits) 1291 break; 1292 1293 start = ocfs2_find_next_bit(bitmap, total_bits, offset); 1294 free_bits = start - offset; 1295 if (contig_bits < free_bits) 1296 contig_bits = free_bits; 1297 } 1298 1299 return contig_bits; 1300 } 1301 1302 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 1303 struct buffer_head *bg_bh, 1304 unsigned int bits_wanted, 1305 unsigned int total_bits, 1306 struct ocfs2_suballoc_result *res) 1307 { 1308 void *bitmap; 1309 u16 best_offset, best_size; 1310 u16 prev_best_size = 0; 1311 int offset, start, found, status = 0; 1312 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1313 1314 /* Callers got this descriptor from 1315 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
 */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}

/*
 * Mark @num_bits bits starting at @bit_off as allocated in the group
 * descriptor @bg, journaling the change through @handle.  The group's
 * free-bit count and cached max-contig-run (bg_contig_free_bits) are
 * updated to match.  @fastpath lets callers supply a precomputed
 * contig value and skip rescanning the bitmap.
 */
int ocfs2_block_group_set_bits(handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_group_desc *bg,
			       struct buffer_head *group_bh,
			       unsigned int bit_off,
			       unsigned int num_bits,
			       unsigned int max_contig_bits,
			       int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	/* Cluster bitmap changes need undo access so a frees' bits are
	 * not reused before the transaction commits (see comment above
	 * ocfs2_test_bg_bit_allocatable). */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* Underflow of the u16 wraps high -- catch descriptor corruption. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * this is optimize path, caller set old contig value
	 * in max_contig_bits to bypass finding action.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}

/* find the one with the most empty bits */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	BUG_ON(!cl->cl_next_free_rec);

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
		    le32_to_cpu(cl->cl_recs[best].c_free))
			best = curr;
		curr++;
	}

	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
	return best;
}

/*
 * Move group @bg to the head of @chain (unlinking it from behind
 * @prev_bg).  All three buffers are modified under one transaction;
 * on journal-access failure the in-memory links are rolled back so
 * the cached blocks stay consistent with disk.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug.
*/ 1475 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1476 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); 1477 1478 trace_ocfs2_relink_block_group( 1479 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1480 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1481 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1482 1483 bg_ptr = le64_to_cpu(bg->bg_next_group); 1484 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1485 1486 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1487 prev_bg_bh, 1488 OCFS2_JOURNAL_ACCESS_WRITE); 1489 if (status < 0) 1490 goto out; 1491 1492 prev_bg->bg_next_group = bg->bg_next_group; 1493 ocfs2_journal_dirty(handle, prev_bg_bh); 1494 1495 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1496 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1497 if (status < 0) 1498 goto out_rollback_prev_bg; 1499 1500 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1501 ocfs2_journal_dirty(handle, bg_bh); 1502 1503 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1504 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1505 if (status < 0) 1506 goto out_rollback_bg; 1507 1508 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1509 ocfs2_journal_dirty(handle, fe_bh); 1510 1511 out: 1512 if (status < 0) 1513 mlog_errno(status); 1514 return status; 1515 1516 out_rollback_bg: 1517 bg->bg_next_group = cpu_to_le64(bg_ptr); 1518 out_rollback_prev_bg: 1519 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1520 goto out; 1521 } 1522 1523 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 1524 u32 wanted) 1525 { 1526 return le16_to_cpu(bg->bg_free_bits_count) > wanted; 1527 } 1528 1529 /* return 0 on success, -ENOSPC to keep searching and any other < 0 1530 * value on error. 
 */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Cheap reject: the cached longest-contig-run (when recorded)
	 * already says this group cannot satisfy the request. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may un-initialized, so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize. If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says.
		 */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		/* Honor the caller's upper block-number limit, if any. */
		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits.
		 */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}

/* Group-search callback for block (metadata/inode) suballocators;
 * same contract as ocfs2_cluster_group_search above. */
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res)
{
	int ret = -ENOSPC;
	u64 blkoff;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;

	/* Block suballocators never ask for a minimum run > 1. */
	BUG_ON(min_bits != 1);
	BUG_ON(ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
							group_bh, bits_wanted,
							le16_to_cpu(bg->bg_bits),
							res);
		if (!ret && max_block) {
			blkoff = le64_to_cpu(bg->bg_blkno) +
				res->sr_bit_offset + res->sr_bits;
			trace_ocfs2_block_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				ret = -ENOSPC;
		}
	}

	return ret;
}

/*
 * Journal and apply the allocator dinode side of an allocation:
 * bump the used count and take @num_bits off chain @chain's free
 * count.  Undone by ocfs2_rollback_alloc_dinode_counts() below.
 */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				     handle_t *handle,
				     struct buffer_head *di_bh,
				     u32 num_bits,
				     u16 chain)
{
	int ret;
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

out:
	return ret;
}

/*
 * In-memory inverse of ocfs2_alloc_dinode_update_counts(); used when
 * ocfs2_block_group_set_bits() fails after the counts were updated.
 * The buffer was already journaled by the forward path.
 */
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
				struct buffer_head *di_bh,
				u32 num_bits,
				u16 chain)
{
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl;

	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}

/*
 * If the result's bit offset falls inside extent @rec of a discontig
 * block group, translate it to a disk block number (clamping sr_bits
 * to the extent) and return 1; otherwise return 0.
 */
static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
					 struct ocfs2_extent_rec *rec,
					 struct ocfs2_chain_list *cl)
{
	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;

	if (res->sr_bit_offset < bitoff)
		return 0;
	if (res->sr_bit_offset >= (bitoff + bitcount))
		return 0;
	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
		(res->sr_bit_offset - bitoff);
	/* Truncate the run so it does not cross an extent boundary. */
	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
	return 1;
}

/*
 * Fill in res->sr_blkno from the raw group/bit result.  Contiguous
 * groups use simple arithmetic (and clear sr_bg_blkno); discontiguous
 * groups walk the extent list.  Cluster bitmap results carry no block
 * number -- callers use cluster offsets instead.
 */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}

/*
 * Try to allocate from the single group at res->sr_bg_blkno (the
 * caller's hint).  On success the group and allocator dinode are
 * updated under @handle and *bits_left reports the group's remaining
 * free bits.  With ac->ac_find_loc_only set, only the search is done.
 */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_group_descriptor(alloc_inode, di,
					  res->sr_bg_blkno, &group_bh);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		/* Undo the dinode counts -- the group itself is untouched. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}

/*
 * Walk chain ac->ac_chain looking for a group that can supply the
 * request, then claim the bits there.  May relink the winning group to
 * the head of its chain (at most one relink per transaction -- see
 * ocfs2_claim_suballoc_bits).  Same output contract as
 * ocfs2_search_one_group().
 */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			/* Discontig fallback: shrink the request to the
			 * largest contiguous run this group can offer,
			 * as long as it still meets min_bits. */
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
					     bits_wanted, min_bits,
					     ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		/* Keep exactly one trailing descriptor for relinking. */
		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
					ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
		(unsigned long long)le64_to_cpu(fe->i_blkno),
		res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* will give out up to bits_wanted contiguous bits. */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations.
		 */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left);
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Hint missed (or absent) -- fall back to a full chain search. */
	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc.
	 */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Claim up to @bits_wanted contiguous metadata blocks from the
 * reserved suballocator.  Returns the suballoc group (*suballoc_loc),
 * the starting bit in that group, the block number of the first
 * allocated block, and the actual number of bits claimed.
 */
int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   bits_wanted,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

*suballoc_loc = res.sr_bg_blkno; 2105 *suballoc_bit_start = res.sr_bit_offset; 2106 *blkno_start = res.sr_blkno; 2107 ac->ac_bits_given += res.sr_bits; 2108 *num_bits = res.sr_bits; 2109 status = 0; 2110 bail: 2111 if (status) 2112 mlog_errno(status); 2113 return status; 2114 } 2115 2116 static void ocfs2_init_inode_ac_group(struct inode *dir, 2117 struct buffer_head *parent_di_bh, 2118 struct ocfs2_alloc_context *ac) 2119 { 2120 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; 2121 /* 2122 * Try to allocate inodes from some specific group. 2123 * 2124 * If the parent dir has recorded the last group used in allocation, 2125 * cool, use it. Otherwise if we try to allocate new inode from the 2126 * same slot the parent dir belongs to, use the same chunk. 2127 * 2128 * We are very careful here to avoid the mistake of setting 2129 * ac_last_group to a group descriptor from a different (unlocked) slot. 2130 */ 2131 if (OCFS2_I(dir)->ip_last_used_group && 2132 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2133 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2134 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { 2135 if (di->i_suballoc_loc) 2136 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); 2137 else 2138 ac->ac_last_group = ocfs2_which_suballoc_group( 2139 le64_to_cpu(di->i_blkno), 2140 le16_to_cpu(di->i_suballoc_bit)); 2141 } 2142 } 2143 2144 static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2145 struct ocfs2_alloc_context *ac) 2146 { 2147 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; 2148 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2149 } 2150 2151 int ocfs2_find_new_inode_loc(struct inode *dir, 2152 struct buffer_head *parent_fe_bh, 2153 struct ocfs2_alloc_context *ac, 2154 u64 *fe_blkno) 2155 { 2156 int ret; 2157 handle_t *handle = NULL; 2158 struct ocfs2_suballoc_result *res; 2159 2160 BUG_ON(!ac); 2161 BUG_ON(ac->ac_bits_given != 0); 2162 BUG_ON(ac->ac_bits_wanted != 1); 2163 
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	res = kzalloc(sizeof(*res), GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* Ownership of res passes to the context on success. */
	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	if (ret)
		kfree(res);

	return ret;
}

/*
 * Commit the allocation previously located by ocfs2_find_new_inode_loc():
 * set the bitmap bits and update the allocator dinode for the result
 * stored in ac->ac_find_loc_priv.
 */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	/* sr_bg_stable_blkno is immune to discontig fixups, so it is
	 * the safe handle to re-read the group by. */
	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
					       ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	/* Inodes are always allocated one bit at a time. */
	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}

/*
 * Search-and-claim a single bit for a new inode under @dir in one
 * step (contrast with the find/claim-at-loc pair above).
 */
int ocfs2_claim_new_inode(handle_t *handle,
			  struct inode *dir,
			  struct buffer_head *parent_fe_bh,
			  struct ocfs2_alloc_context *ac,
			  u64 *suballoc_loc,
			  u16 *suballoc_bit,
			  u64 *fe_blkno)
{
	int status;
	struct ocfs2_suballoc_result res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   1,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
2312 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2313 2314 BUG_ON(res.sr_bits != 1); 2315 2316 *suballoc_loc = res.sr_bg_blkno; 2317 *suballoc_bit = res.sr_bit_offset; 2318 *fe_blkno = res.sr_blkno; 2319 ac->ac_bits_given++; 2320 ocfs2_save_inode_ac_group(dir, ac); 2321 status = 0; 2322 bail: 2323 if (status) 2324 mlog_errno(status); 2325 return status; 2326 } 2327 2328 /* translate a group desc. blkno and it's bitmap offset into 2329 * disk cluster offset. */ 2330 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 2331 u64 bg_blkno, 2332 u16 bg_bit_off) 2333 { 2334 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2335 u32 cluster = 0; 2336 2337 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2338 2339 if (bg_blkno != osb->first_cluster_group_blkno) 2340 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 2341 cluster += (u32) bg_bit_off; 2342 return cluster; 2343 } 2344 2345 /* given a cluster offset, calculate which block group it belongs to 2346 * and return that block offset. */ 2347 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) 2348 { 2349 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2350 u32 group_no; 2351 2352 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2353 2354 group_no = cluster / osb->bitmap_cpg; 2355 if (!group_no) 2356 return osb->first_cluster_group_blkno; 2357 return ocfs2_clusters_to_blocks(inode->i_sb, 2358 group_no * osb->bitmap_cpg); 2359 } 2360 2361 /* given the block number of a cluster start, calculate which cluster 2362 * group and descriptor bitmap offset that corresponds to. 
 */
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	*bg_blkno = ocfs2_which_cluster_group(inode,
					      data_cluster);

	/* Bits in the first group are absolute cluster offsets; in
	 * every other group they are relative to the group start. */
	if (*bg_blkno == osb->first_cluster_group_blkno)
		*bg_bit_off = (u16) data_cluster;
	else
		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
							     data_blkno - *bg_blkno);
}

/*
 * min_bits - minimum contiguous chunk from this total allocation we
 * can handle. set to what we asked for originally for a full
 * contig. allocation, set to '1' to indicate we can deal with extents
 * of any size.
 *
 * On success fills *cluster_start / *num_clusters with the claimed
 * range and charges ac->ac_bits_given.
 */
int __ocfs2_claim_clusters(handle_t *handle,
			   struct ocfs2_alloc_context *ac,
			   u32 min_clusters,
			   u32 max_clusters,
			   u32 *cluster_start,
			   u32 *num_clusters)
{
	int status;
	unsigned int bits_wanted = max_clusters;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);

	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
	       && ac->ac_which != OCFS2_AC_USE_MAIN
	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);

	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
		/* Callers wanting contiguity beyond one cluster should
		 * not be routed through the local allocator. */
		WARN_ON(min_clusters > 1);

		status = ocfs2_claim_local_alloc_bits(osb,
						      handle,
						      ac,
						      bits_wanted,
						      cluster_start,
						      num_clusters);
		if (!status)
			atomic_inc(&osb->alloc_stats.local_data);
	} else {
		if (min_clusters > (osb->bitmap_cpg - 1)) {
			/* The only paths asking for contiguousness
			 * should know about this already. */
			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
			     "group bitmap size %u!\n", min_clusters,
			     osb->bitmap_cpg);
			status = -ENOSPC;
			goto bail;
		}
		/* clamp the current request down to a realistic size. */
		if (bits_wanted > (osb->bitmap_cpg - 1))
			bits_wanted = osb->bitmap_cpg - 1;

		status = ocfs2_claim_suballoc_bits(ac,
						   handle,
						   bits_wanted,
						   min_clusters,
						   &res);
		if (!status) {
			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
			/* Convert group descriptor + bit offset into a
			 * global cluster offset for the caller. */
			*cluster_start =
				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
								 res.sr_bg_blkno,
								 res.sr_bit_offset);
			atomic_inc(&osb->alloc_stats.bitmap_data);
			*num_clusters = res.sr_bits;
		}
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	ac->ac_bits_given += *num_clusters;

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Claim the remaining reserved clusters in @ac, needing at least
 * @min_clusters contiguous. Thin wrapper over __ocfs2_claim_clusters().
 */
int ocfs2_claim_clusters(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 min_clusters,
			 u32 *cluster_start,
			 u32 *num_clusters)
{
	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;

	return __ocfs2_claim_clusters(handle, ac, min_clusters,
				      bits_wanted, cluster_start, num_clusters);
}

/*
 * Clear @num_bits bits starting at @bit_off in group @bg's bitmap.
 *
 * When @undo_fn is non-NULL (cluster bitmap only, see the BUG_ON) the
 * same bit flips are mirrored into the journal's committed-data copy
 * under jh->b_state_lock, so the free is journalled with undo
 * protection (OCFS2_JOURNAL_ACCESS_UNDO).
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	/* Undo semantics are only used for the cluster bitmap. */
	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_state_lock guards b_committed_data while we mirror
		 * the bit changes into it. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	/* Sanity: the free count can never exceed the group size. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/* Recompute the longest contiguous free run, keeping
		 * whichever of the caller's hint or the scan is larger. */
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
					le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}

/*
 * expects the suballoc inode to already be locked.
 */
static int _ocfs2_free_suballoc_bits(handle_t *handle,
				     struct inode *alloc_inode,
				     struct buffer_head *alloc_bh,
				     unsigned int start_bit,
				     u64 bg_blkno,
				     unsigned int count,
				     void (*undo_fn)(unsigned int bit,
						     unsigned long *bitmap))
{
	int status = 0;
	u32 tmp_used;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *group;
	__le16 old_bg_contig_free_bits = 0;

	/* The alloc_bh comes from ocfs2_free_dinode() or
	 * ocfs2_free_clusters(). The callers have all locked the
	 * allocator and gotten alloc_bh from the lock call. This
	 * validates the dinode buffer. Any corruption that has happened
	 * is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));

	trace_ocfs2_free_suballoc_bits(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		(unsigned long long)bg_blkno,
		start_bit, count);

	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));

	/* Snapshot the contiguous-free count so a failure below can
	 * restore the group descriptor exactly. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		old_bg_contig_free_bits = group->bg_contig_free_bits;
	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
					      group, group_bh,
					      start_bit, count, 0, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		/* Re-set the bits we cleared above: the dinode counts
		 * were never updated, so roll the group back. */
		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
					   start_bit, count,
					   le16_to_cpu(old_bg_contig_free_bits),
					   1);
		goto bail;
	}

	/* Credit the freed bits to this group's chain record and the
	 * allocator inode's used counter. */
	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
		     count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
	ocfs2_journal_dirty(handle, alloc_bh);

bail:
	brelse(group_bh);
	return status;
}

/*
 * Free @count suballocator bits starting at @start_bit of group
 * @bg_blkno, without undo-buffer protection.
 */
int ocfs2_free_suballoc_bits(handle_t *handle,
			     struct inode *alloc_inode,
			     struct buffer_head *alloc_bh,
			     unsigned int start_bit,
			     u64 bg_blkno,
			     unsigned int count)
{
	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
					 start_bit, bg_blkno, count, NULL);
}

/*
 * Return the single suballocator bit backing inode @di to its inode
 * allocator.
 */
int ocfs2_free_dinode(handle_t *handle,
		      struct inode *inode_alloc_inode,
		      struct buffer_head *inode_alloc_bh,
		      struct ocfs2_dinode *di)
{
	u64 blk = le64_to_cpu(di->i_blkno);
	u16 bit = le16_to_cpu(di->i_suballoc_bit);
	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);

	/* If the inode records its descriptor block explicitly
	 * (i_suballoc_loc), prefer that over the computed group. */
	if (di->i_suballoc_loc)
		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
					inode_alloc_bh, bit, bg_blkno, 1);
}

/*
 * Free @num_clusters clusters starting at block @start_blk back to the
 * global bitmap. @undo_fn selects how the journal's committed-data
 * copy is adjusted (set for a real free, clear for a release of
 * never-used clusters).
 */
static int _ocfs2_free_clusters(handle_t *handle,
				struct inode *bitmap_inode,
				struct buffer_head *bitmap_bh,
				u64 start_blk,
				unsigned int num_clusters,
				void (*undo_fn)(unsigned int bit,
						unsigned long *bitmap))
{
	int status;
	u16 bg_start_bit;
	u64 bg_blkno;

	/* You can't ever have a contiguous set of clusters
	 * bigger than a block group bitmap so we never have to worry
	 * about looping on them.
	 * This is expensive. We can safely remove once this stuff has
	 * gotten tested really well. */
	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
							 start_blk)));


	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
				     &bg_start_bit);

	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
			(unsigned long long)start_blk,
			bg_start_bit, num_clusters);

	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
					   bg_start_bit, bg_blkno,
					   num_clusters, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* NOTE(review): presumably tells the local allocator that free
	 * bits appeared — see ocfs2_local_alloc_seen_free_bits(). */
	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
					 num_clusters);

out:
	return status;
}

/*
 * Free previously-used clusters; the undo buffer keeps them visible
 * as allocated until the transaction commits.
 */
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_set_bit);
}

/*
 * Give never-used clusters back to the global bitmap. We don't need
 * to protect these bits in the undo buffer.
 */
int ocfs2_release_clusters(handle_t *handle,
			   struct inode *bitmap_inode,
			   struct buffer_head *bitmap_bh,
			   u64 start_blk,
			   unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_clear_bit);
}

/*
 * For a given allocation, determine which allocators will need to be
 * accessed, and lock them, reserving the appropriate number of bits.
 *
 * Sparse file systems call this from ocfs2_write_begin_nolock()
 * and ocfs2_allocate_unwritten_extents().
 *
 * File systems which don't support holes call this from
 * ocfs2_extend_allocation().
 */
int ocfs2_lock_allocators(struct inode *inode,
			  struct ocfs2_extent_tree *et,
			  u32 clusters_to_add, u32 extents_to_split,
			  struct ocfs2_alloc_context **data_ac,
			  struct ocfs2_alloc_context **meta_ac)
{
	int ret = 0, num_free_extents;
	/* A split can consume up to two additional extent records. */
	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	*meta_ac = NULL;
	if (data_ac)
		*data_ac = NULL;

	BUG_ON(clusters_to_add != 0 && data_ac == NULL);

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Sparse allocation file systems need to be more conservative
	 * with reserving room for expansion - the actual allocation
	 * happens while we've got a journal handle open so re-taking
	 * a cluster lock (because we ran out of room for another
	 * extent) will violate ordering rules.
	 *
	 * Most of the time we'll only be seeing this 1 cluster at a time
	 * anyway.
	 *
	 * Always lock for any unwritten extents - we might want to
	 * add blocks during a split.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

	if (clusters_to_add == 0)
		goto out;

	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

out:
	if (ret) {
		/* On error, release the metadata reservation we may
		 * have taken above. */
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}

		/*
		 * We cannot have an error and a non null *data_ac.
		 */
	}

	return ret;
}

/*
 * Read the inode specified by blkno to get suballoc_slot and
 * suballoc_bit.
 */
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
				       u16 *suballoc_slot, u64 *group_blkno,
				       u16 *suballoc_bit)
{
	int status;
	struct buffer_head *inode_bh = NULL;
	struct ocfs2_dinode *inode_fe;

	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);

	/* dirty read disk */
	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
	if (status < 0) {
		mlog(ML_ERROR, "read block %llu failed %d\n",
		     (unsigned long long)blkno, status);
		goto bail;
	}

	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
		mlog(ML_ERROR, "invalid inode %llu requested\n",
		     (unsigned long long)blkno);
		status = -EINVAL;
		goto bail;
	}

	/* A valid slot is either OCFS2_INVALID_SLOT (global allocator)
	 * or within [0, max_slots). */
	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
		     (unsigned long long)blkno,
		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
		status = -EINVAL;
		goto bail;
	}

	/* All output pointers are optional. */
	if (suballoc_slot)
		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
	if (suballoc_bit)
		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
	if (group_blkno)
		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);

bail:
	brelse(inode_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * test whether bit is SET in allocator bitmap or not. on success, 0
 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
 * is returned and *res is meaningless. Call this after you have
 * cluster locked against suballoc, or you may get a result based on
 * non-up2date contents
 */
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
				   struct inode *suballoc,
				   struct buffer_head *alloc_bh,
				   u64 group_blkno, u64 blkno,
				   u16 bit, int *res)
{
	struct ocfs2_dinode *alloc_di;
	struct ocfs2_group_desc *group;
	struct buffer_head *group_bh = NULL;
	u64 bg_blkno;
	int status;

	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
				      (unsigned int)bit);

	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
		     (unsigned int)bit,
		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
		status = -EINVAL;
		goto bail;
	}

	/* Prefer the recorded descriptor block (discontiguous groups);
	 * otherwise compute the group from blkno/bit. */
	bg_blkno = group_blkno ? group_blkno :
		   ocfs2_which_suballoc_group(blkno, bit);
	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog(ML_ERROR, "read group %llu failed %d\n",
		     (unsigned long long)bg_blkno, status);
		goto bail;
	}

	group = (struct ocfs2_group_desc *) group_bh->b_data;
	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);

bail:
	brelse(group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Test if the bit representing this inode (blkno) is set in the
 * suballocator.
 *
 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
 *
 * In the event of failure, a negative value is returned and *res is
 * meaningless.
 *
 * Callers must make sure to hold nfs_sync_lock to prevent
 * ocfs2_delete_inode() on another node from accessing the same
 * suballocator concurrently.
 */
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
	int status;
	u64 group_blkno = 0;
	u16 suballoc_bit = 0, suballoc_slot = 0;
	struct inode *inode_alloc_inode;
	struct buffer_head *alloc_bh = NULL;

	trace_ocfs2_test_inode_bit((unsigned long long)blkno);

	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
					     &group_blkno, &suballoc_bit);
	if (status < 0) {
		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
		goto bail;
	}

	/* An invalid slot means the inode lives in the global
	 * allocator; otherwise use the per-slot inode allocator. */
	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	else
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	if (!inode_alloc_inode) {
		/* the error code could be inaccurate, but we are not able to
		 * get the correct one. */
		status = -EINVAL;
		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
		     (u32)suballoc_slot);
		goto bail;
	}

	/* Take a shared cluster lock so the bitmap we test is
	 * up to date across the cluster. */
	inode_lock(inode_alloc_inode);
	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
	if (status < 0) {
		inode_unlock(inode_alloc_inode);
		iput(inode_alloc_inode);
		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
		     (u32)suballoc_slot, status);
		goto bail;
	}

	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
					 group_blkno, blkno, suballoc_bit, res);
	if (status < 0)
		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);

	ocfs2_inode_unlock(inode_alloc_inode, 0);
	inode_unlock(inode_alloc_inode);

	iput(inode_alloc_inode);
	brelse(alloc_bh);
bail:
	if (status)
		mlog_errno(status);
	return status;
}