// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * suballoc.c
 *
 * metadata alloc and free
 * Inspired by ext3 block groups.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

#define OCFS2_MAX_TO_STEAL		1024

struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};

static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
{
	if (res->sr_blkno == 0)
		return 0;

	if (res->sr_bg_blkno)
		return res->sr_bg_blkno;

	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}
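/*
 * Illustrative note (added, not from the original source): for a
 * contiguous suballocator group the result carries sr_bg_blkno == 0
 * and the owning group is recovered arithmetically, since bit N of a
 * group maps to block "group descriptor blkno + N".  E.g. an inode
 * allocated at sr_blkno 2049 with sr_bit_offset 1 belongs to the
 * group descriptor at block 2048.  Discontiguous groups can't do that
 * math, which is why sr_bg_blkno is preserved for them.
 */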
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);

void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}

void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}

static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
{
	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}

#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)

static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}

static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}

#undef do_error

/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}

static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	trace_ocfs2_validate_group_descriptor(
					(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}

int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}

static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}

static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block *sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}
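/*
 * Worked example (added, not from the original source): with 4KB
 * blocks and 32KB clusters, cl_bpc = 8 suballocation bits per cluster.
 * A fully contiguous group of, say, cl_cpg = 2048 clusters therefore
 * carries bg_bits = 2048 * 8 = 16384 bits, one per block, and bit 0
 * is consumed immediately by the group descriptor block itself, which
 * is why bg_free_bits_count starts at bg_bits - 1 above.
 */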
static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_count)) {
		if (le32_to_cpu(cl->cl_recs[best].c_total) >
		    le32_to_cpu(cl->cl_recs[curr].c_total))
			best = curr;
		curr++;
	}
	return best;
}

static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}

static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	while (min_bits) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;

		min_bits >>= 1;
	}

	return status;
}

static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent records but still
		 * couldn't fill the group with cl_cpg clusters, so
		 * bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}

static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}

static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}
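/*
 * Illustrative summary (added, not from the original source): a
 * discontiguous group is assembled from whatever extents the cluster
 * bitmap can supply.  ocfs2_block_group_claim_bits() halves its
 * request on each -ENOSPC, so the first extent may be well under
 * cl_cpg/2 clusters; the group is then grown one extent record at a
 * time until it reaches cl_cpg clusters or runs out of records, in
 * which case the partial group is torn down by
 * ocfs2_bg_alloc_cleanup().
 */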
/*
 * We expect the block group allocator to already be locked.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC)
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}

static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}

static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}

static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}

void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}

static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		osb->s_inode_steal_slot = (u16)slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		osb->s_meta_steal_slot = (u16)slot;
	spin_unlock(&osb->osb_lock);
}
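/*
 * Illustrative note (added, not from the original source): stealing
 * walks the other slots round-robin.  On a 4-slot volume, the node in
 * slot 1 probes slots 2, 3, then 0, remembers the first slot that
 * satisfied the reservation via __ocfs2_set_steal_slot(), and keeps
 * using it until OCFS2_MAX_TO_STEAL (1024) allocations have been
 * stolen, after which the caller retries its own slot.
 */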
static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
{
	int slot = OCFS2_INVALID_SLOT;

	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		slot = osb->s_inode_steal_slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		slot = osb->s_meta_steal_slot;
	spin_unlock(&osb->osb_lock);

	return slot;
}

static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}

static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		ocfs2_free_ac_resource(ac);
	}

	return status;
}

static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}

int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);

	if (status >= 0) {
		status = 0;
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}
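/*
 * Usage sketch (added, illustrative only -- names of the surrounding
 * steps are assumptions, not part of this file):
 *
 *	struct ocfs2_alloc_context *meta_ac = NULL;
 *
 *	status = ocfs2_reserve_new_metadata_blocks(osb, blocks, &meta_ac);
 *	if (status < 0)
 *		goto out;
 *	// ... start a transaction, then claim bits with
 *	// ocfs2_claim_metadata() under that handle ...
 *	ocfs2_free_alloc_context(meta_ac);
 *
 * The context only pins a suballocator inode and guarantees the bits
 * exist; nothing is actually allocated until a claim is made under a
 * running transaction.
 */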
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal an inode from other
	 * nodes.  It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * While it is set, we keep stealing inodes until the steal
	 * count forces us to re-check our own slot for free space.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* The local alloc code has to do the same thing, so rather than
 * duplicate the logic we share it here. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
				      struct ocfs2_alloc_context *ac)
{
	int status;

	ac->ac_which = OCFS2_AC_USE_MAIN;
	ac->ac_group_search = ocfs2_cluster_group_search;

	status = ocfs2_reserve_suballoc_bits(osb, ac,
					     GLOBAL_BITMAP_SYSTEM_INODE,
					     OCFS2_INVALID_SLOT, NULL,
					     ALLOC_NEW_GROUP);
	if (status < 0 && status != -ENOSPC)
		mlog_errno(status);

	return status;
}

/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit. */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}
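/*
 * Illustrative note (added, not from the original source): small
 * cluster reservations are normally satisfied from the node's local
 * alloc window.  Only when that fails, or when
 * ALLOC_GROUPS_FROM_GLOBAL is set, does the reservation fall through
 * to the global cluster bitmap, retrying once after flushing the
 * truncate log to recover recently freed space.
 */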
/*
 * More or less lifted from ext3. I'll leave their description below:
 *
 * "For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes."
 *
 * Note: OCFS2 already does this differently for metadata vs data
 * allocations, as those bitmaps are separate and undo access is never
 * called on a metadata group descriptor.
 */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct journal_head *jh;
	int ret;

	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	jh = jbd2_journal_grab_journal_head(bg_bh);
	if (!jh)
		return 1;

	spin_lock(&jh->b_state_lock);
	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
	if (bg)
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	spin_unlock(&jh->b_state_lock);
	jbd2_journal_put_journal_head(jh);

	return ret;
}

u16 ocfs2_find_max_contig_free_bits(void *bitmap,
				    u16 total_bits, u16 start)
{
	u16 offset, free_bits;
	u16 contig_bits = 0;

	while (start < total_bits) {
		offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
		if (offset == total_bits)
			break;

		start = ocfs2_find_next_bit(bitmap, total_bits, offset);
		free_bits = start - offset;
		if (contig_bits < free_bits)
			contig_bits = free_bits;
	}

	return contig_bits;
}

static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}
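/*
 * Worked example (added, not from the original source): scanning a
 * bitmap whose only set bits are 2 and 3 (so bits 0-1 and 4-7 are
 * free) with bits_wanted = 3 first builds a run of 2 at offset 0,
 * restarts at bit 4, and breaks once found reaches 3.  The result is
 * best_offset = 4, best_size = 3, and prev_best_size = 2 -- the
 * losing run, which the caller can later record as the group's
 * remaining contiguous free space.
 */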
int ocfs2_block_group_set_bits(handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_group_desc *bg,
			       struct buffer_head *group_bh,
			       unsigned int bit_off,
			       unsigned int num_bits,
			       unsigned int max_contig_bits,
			       int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while (num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * This is the fast path: the caller passed the previous contig
	 * value in max_contig_bits so we can skip the search.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time.  Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}

/* find the one with the most empty bits */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	BUG_ON(!cl->cl_next_free_rec);

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
		    le32_to_cpu(cl->cl_recs[best].c_free))
			best = curr;
		curr++;
	}

	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
	return best;
}

static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}

static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
}
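/*
 * Illustrative note (added, not from the original source): relinking
 * rewires three blocks in order: the previous group skips over the
 * found group, the found group points at the old chain head, and the
 * chain record then names the found group as its new head.  A journal
 * access failure mid-way is rolled back in memory via the saved
 * bg_ptr/prev_bg_ptr values, keeping the cached descriptors
 * consistent with what actually hit the journal.
 */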
/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error. */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may be uninitialized, so check the
	 * full free count as well */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize.  If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says. */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits. */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}

static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res)
{
	int ret = -ENOSPC;
	u64 blkoff;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON(min_bits != 1);
	BUG_ON(ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
							group_bh, bits_wanted,
							le16_to_cpu(bg->bg_bits),
							res);
		if (!ret && max_block) {
			blkoff = le64_to_cpu(bg->bg_blkno) +
				res->sr_bit_offset + res->sr_bits;
			trace_ocfs2_block_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				ret = -ENOSPC;
		}
	}

	return ret;
}

int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				     handle_t *handle,
				     struct buffer_head *di_bh,
				     u32 num_bits,
				     u16 chain)
{
	int ret;
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

out:
	return ret;
}

void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
					struct buffer_head *di_bh,
					u32 num_bits,
					u16 chain)
{
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl;

	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}

static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
					 struct ocfs2_extent_rec *rec,
					 struct ocfs2_chain_list *cl)
{
	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;

	if (res->sr_bit_offset < bitoff)
		return 0;
	if (res->sr_bit_offset >= (bitoff + bitcount))
		return 0;
	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
		(res->sr_bit_offset - bitoff);
	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
	return 1;
}
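/*
 * Worked example (added, not from the original source): take a
 * discontiguous group with cl_bpc = 4 and an extent record
 * { e_cpos = 8, e_blkno = 5000, e_leaf_clusters = 2 }.  The record
 * covers bitmap bits [32, 40).  A result with sr_bit_offset = 35
 * lands inside it, so sr_blkno becomes 5000 + (35 - 32) = 5003, and
 * sr_bits is clipped if the run would spill past bit 40.
 */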
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}

static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_group_descriptor(alloc_inode, di,
					  res->sr_bg_blkno, &group_bh);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
						   res->sr_bits,
						   le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}

static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while ((status = ac->ac_group_search(alloc_inode, group_bh,
					     bits_wanted, min_bits,
					     ac->ac_max_block,
					     res)) == -ENOSPC) {
		if (!bg->bg_next_group)
			break;

		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
					ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
			(unsigned long long)le64_to_cpu(fe->i_blkno),
			res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* will give out up to bits_wanted contiguous bits. */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left);
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}

int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   bits_wanted,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	*suballoc_loc = res.sr_bg_blkno;
	*suballoc_bit_start = res.sr_bit_offset;
	*blkno_start = res.sr_blkno;
	ac->ac_bits_given += res.sr_bits;
	*num_bits = res.sr_bits;
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}
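/*
 * Illustrative note (added, not from the original source): a caller
 * asking ocfs2_claim_metadata() for, say, 4 extent blocks may be
 * handed fewer in one go.  *num_bits reports how many contiguous bits
 * were actually claimed starting at *suballoc_bit_start, and
 * *blkno_start is the disk block of the first of them; callers loop,
 * comparing ac->ac_bits_given against ac->ac_bits_wanted, until the
 * reservation is fully consumed.
 */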
2075 if (status) 2076 mlog_errno(status); 2077 return status; 2078 } 2079 2080 static void ocfs2_init_inode_ac_group(struct inode *dir, 2081 struct buffer_head *parent_di_bh, 2082 struct ocfs2_alloc_context *ac) 2083 { 2084 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; 2085 /* 2086 * Try to allocate inodes from some specific group. 2087 * 2088 * If the parent dir has recorded the last group used in allocation, 2089 * cool, use it. Otherwise if we try to allocate new inode from the 2090 * same slot the parent dir belongs to, use the same chunk. 2091 * 2092 * We are very careful here to avoid the mistake of setting 2093 * ac_last_group to a group descriptor from a different (unlocked) slot. 2094 */ 2095 if (OCFS2_I(dir)->ip_last_used_group && 2096 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2097 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2098 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { 2099 if (di->i_suballoc_loc) 2100 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); 2101 else 2102 ac->ac_last_group = ocfs2_which_suballoc_group( 2103 le64_to_cpu(di->i_blkno), 2104 le16_to_cpu(di->i_suballoc_bit)); 2105 } 2106 } 2107 2108 static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2109 struct ocfs2_alloc_context *ac) 2110 { 2111 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; 2112 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2113 } 2114 2115 int ocfs2_find_new_inode_loc(struct inode *dir, 2116 struct buffer_head *parent_fe_bh, 2117 struct ocfs2_alloc_context *ac, 2118 u64 *fe_blkno) 2119 { 2120 int ret; 2121 handle_t *handle = NULL; 2122 struct ocfs2_suballoc_result *res; 2123 2124 BUG_ON(!ac); 2125 BUG_ON(ac->ac_bits_given != 0); 2126 BUG_ON(ac->ac_bits_wanted != 1); 2127 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 2128 2129 res = kzalloc(sizeof(*res), GFP_NOFS); 2130 if (res == NULL) { 2131 ret = -ENOMEM; 2132 mlog_errno(ret); 2133 goto out; 2134 } 2135 2136 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2137 2138 /* 2139 * The handle started here is for chain relink. Alternatively, 2140 * we could just disable relink for these calls. 2141 */ 2142 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); 2143 if (IS_ERR(handle)) { 2144 ret = PTR_ERR(handle); 2145 handle = NULL; 2146 mlog_errno(ret); 2147 goto out; 2148 } 2149 2150 /* 2151 * This will instruct ocfs2_claim_suballoc_bits and 2152 * ocfs2_search_one_group to search but save actual allocation 2153 * for later. 2154 */ 2155 ac->ac_find_loc_only = 1; 2156 2157 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); 2158 if (ret < 0) { 2159 mlog_errno(ret); 2160 goto out; 2161 } 2162 2163 ac->ac_find_loc_priv = res; 2164 *fe_blkno = res->sr_blkno; 2165 ocfs2_update_inode_fsync_trans(handle, dir, 0); 2166 out: 2167 if (handle) 2168 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2169 2170 if (ret) 2171 kfree(res); 2172 2173 return ret; 2174 } 2175 2176 int ocfs2_claim_new_inode_at_loc(handle_t *handle, 2177 struct inode *dir, 2178 struct ocfs2_alloc_context *ac, 2179 u64 *suballoc_loc, 2180 u16 *suballoc_bit, 2181 u64 di_blkno) 2182 { 2183 int ret; 2184 u16 chain; 2185 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; 2186 struct buffer_head *bg_bh = NULL; 2187 struct ocfs2_group_desc *bg; 2188 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; 2189 2190 /* 2191 * Since di_blkno is being passed back in, we check for any 2192 * inconsistencies which may have happened between 2193 * calls. 
These are code bugs as di_blkno is not expected to 2194 * change once returned from ocfs2_find_new_inode_loc(). 2195 */ 2196 BUG_ON(res->sr_blkno != di_blkno); 2197 2198 ret = ocfs2_read_group_descriptor(ac->ac_inode, di, 2199 res->sr_bg_stable_blkno, &bg_bh); 2200 if (ret) { 2201 mlog_errno(ret); 2202 goto out; 2203 } 2204 2205 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 2206 chain = le16_to_cpu(bg->bg_chain); 2207 2208 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, 2209 ac->ac_bh, res->sr_bits, 2210 chain); 2211 if (ret) { 2212 mlog_errno(ret); 2213 goto out; 2214 } 2215 2216 ret = ocfs2_block_group_set_bits(handle, 2217 ac->ac_inode, 2218 bg, 2219 bg_bh, 2220 res->sr_bit_offset, 2221 res->sr_bits, 2222 res->sr_max_contig_bits, 2223 0); 2224 if (ret < 0) { 2225 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, 2226 ac->ac_bh, res->sr_bits, chain); 2227 mlog_errno(ret); 2228 goto out; 2229 } 2230 2231 trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, 2232 res->sr_bits); 2233 2234 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2235 2236 BUG_ON(res->sr_bits != 1); 2237 2238 *suballoc_loc = res->sr_bg_blkno; 2239 *suballoc_bit = res->sr_bit_offset; 2240 ac->ac_bits_given++; 2241 ocfs2_save_inode_ac_group(dir, ac); 2242 2243 out: 2244 brelse(bg_bh); 2245 2246 return ret; 2247 } 2248 2249 int ocfs2_claim_new_inode(handle_t *handle, 2250 struct inode *dir, 2251 struct buffer_head *parent_fe_bh, 2252 struct ocfs2_alloc_context *ac, 2253 u64 *suballoc_loc, 2254 u16 *suballoc_bit, 2255 u64 *fe_blkno) 2256 { 2257 int status; 2258 struct ocfs2_suballoc_result res; 2259 2260 BUG_ON(!ac); 2261 BUG_ON(ac->ac_bits_given != 0); 2262 BUG_ON(ac->ac_bits_wanted != 1); 2263 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 2264 2265 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2266 2267 status = ocfs2_claim_suballoc_bits(ac, 2268 handle, 2269 1, 2270 1, 2271 &res); 2272 if (status < 0) { 2273 mlog_errno(status); 2274 goto bail; 2275 } 2276 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2277 2278 BUG_ON(res.sr_bits != 1); 2279 2280 *suballoc_loc = res.sr_bg_blkno; 2281 *suballoc_bit = res.sr_bit_offset; 2282 *fe_blkno = res.sr_blkno; 2283 ac->ac_bits_given++; 2284 ocfs2_save_inode_ac_group(dir, ac); 2285 status = 0; 2286 bail: 2287 if (status) 2288 mlog_errno(status); 2289 return status; 2290 } 2291 2292 /* translate a group desc. blkno and its bitmap offset into 2293 * disk cluster offset. */ 2294 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 2295 u64 bg_blkno, 2296 u16 bg_bit_off) 2297 { 2298 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2299 u32 cluster = 0; 2300 2301 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2302 2303 if (bg_blkno != osb->first_cluster_group_blkno) 2304 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 2305 cluster += (u32) bg_bit_off; 2306 return cluster; 2307 } 2308 2309 /* given a cluster offset, calculate which block group it belongs to 2310 * and return that block offset.
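 *
 * A minimal worked example (assuming osb->bitmap_cpg == 32256, a
 * typical value for 4k blocks): cluster 70000 falls in group_no 2,
 * so we return the block offset of cluster 64512, while any
 * cluster below 32256 has group_no 0 and maps to
 * osb->first_cluster_group_blkno.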
*/ 2311 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) 2312 { 2313 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2314 u32 group_no; 2315 2316 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2317 2318 group_no = cluster / osb->bitmap_cpg; 2319 if (!group_no) 2320 return osb->first_cluster_group_blkno; 2321 return ocfs2_clusters_to_blocks(inode->i_sb, 2322 group_no * osb->bitmap_cpg); 2323 } 2324 2325 /* given the block number of a cluster start, calculate which cluster 2326 * group and descriptor bitmap offset that corresponds to. */ 2327 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 2328 u64 data_blkno, 2329 u64 *bg_blkno, 2330 u16 *bg_bit_off) 2331 { 2332 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2333 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); 2334 2335 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2336 2337 *bg_blkno = ocfs2_which_cluster_group(inode, 2338 data_cluster); 2339 2340 if (*bg_blkno == osb->first_cluster_group_blkno) 2341 *bg_bit_off = (u16) data_cluster; 2342 else 2343 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, 2344 data_blkno - *bg_blkno); 2345 } 2346 2347 /* 2348 * min_bits - minimum contiguous chunk from this total allocation we 2349 * can handle. set to what we asked for originally for a full 2350 * contig. allocation, set to '1' to indicate we can deal with extents 2351 * of any size. 2352 */ 2353 int __ocfs2_claim_clusters(handle_t *handle, 2354 struct ocfs2_alloc_context *ac, 2355 u32 min_clusters, 2356 u32 max_clusters, 2357 u32 *cluster_start, 2358 u32 *num_clusters) 2359 { 2360 int status; 2361 unsigned int bits_wanted = max_clusters; 2362 struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; 2363 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); 2364 2365 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 2366 2367 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 2368 && ac->ac_which != OCFS2_AC_USE_MAIN); 2369 2370 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2371 WARN_ON(min_clusters > 1); 2372 2373 status = ocfs2_claim_local_alloc_bits(osb, 2374 handle, 2375 ac, 2376 bits_wanted, 2377 cluster_start, 2378 num_clusters); 2379 if (!status) 2380 atomic_inc(&osb->alloc_stats.local_data); 2381 } else { 2382 if (min_clusters > (osb->bitmap_cpg - 1)) { 2383 /* The only paths asking for contiguousness 2384 * should know about this already. */ 2385 mlog(ML_ERROR, "minimum allocation requested %u exceeds " 2386 "group bitmap size %u!\n", min_clusters, 2387 osb->bitmap_cpg); 2388 status = -ENOSPC; 2389 goto bail; 2390 } 2391 /* clamp the current request down to a realistic size. 
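 * A single claim can never span more than one cluster group, so
 * bits_wanted is capped at osb->bitmap_cpg - 1 to match the
 * min_clusters check above; the caller simply comes back for the
 * remainder on a later claim.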
*/ 2392 if (bits_wanted > (osb->bitmap_cpg - 1)) 2393 bits_wanted = osb->bitmap_cpg - 1; 2394 2395 status = ocfs2_claim_suballoc_bits(ac, 2396 handle, 2397 bits_wanted, 2398 min_clusters, 2399 &res); 2400 if (!status) { 2401 BUG_ON(res.sr_blkno); /* cluster alloc can't set */ 2402 *cluster_start = 2403 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2404 res.sr_bg_blkno, 2405 res.sr_bit_offset); 2406 atomic_inc(&osb->alloc_stats.bitmap_data); 2407 *num_clusters = res.sr_bits; 2408 } 2409 } 2410 if (status < 0) { 2411 if (status != -ENOSPC) 2412 mlog_errno(status); 2413 goto bail; 2414 } 2415 2416 ac->ac_bits_given += *num_clusters; 2417 2418 bail: 2419 if (status) 2420 mlog_errno(status); 2421 return status; 2422 } 2423 2424 int ocfs2_claim_clusters(handle_t *handle, 2425 struct ocfs2_alloc_context *ac, 2426 u32 min_clusters, 2427 u32 *cluster_start, 2428 u32 *num_clusters) 2429 { 2430 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2431 2432 return __ocfs2_claim_clusters(handle, ac, min_clusters, 2433 bits_wanted, cluster_start, num_clusters); 2434 } 2435 2436 static int ocfs2_block_group_clear_bits(handle_t *handle, 2437 struct inode *alloc_inode, 2438 struct ocfs2_group_desc *bg, 2439 struct buffer_head *group_bh, 2440 unsigned int bit_off, 2441 unsigned int num_bits, 2442 unsigned int max_contig_bits, 2443 void (*undo_fn)(unsigned int bit, 2444 unsigned long *bmap)) 2445 { 2446 int status; 2447 unsigned int tmp; 2448 u16 contig_bits; 2449 struct ocfs2_group_desc *undo_bg = NULL; 2450 struct journal_head *jh; 2451 2452 /* The caller got this descriptor from 2453 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 2454 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 2455 2456 trace_ocfs2_block_group_clear_bits(bit_off, num_bits); 2457 2458 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); 2459 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2460 group_bh, 2461 undo_fn ? 2462 OCFS2_JOURNAL_ACCESS_UNDO : 2463 OCFS2_JOURNAL_ACCESS_WRITE); 2464 if (status < 0) { 2465 mlog_errno(status); 2466 goto bail; 2467 } 2468 2469 jh = bh2jh(group_bh); 2470 if (undo_fn) { 2471 spin_lock(&jh->b_state_lock); 2472 undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; 2473 BUG_ON(!undo_bg); 2474 } 2475 2476 tmp = num_bits; 2477 while (tmp--) { 2478 ocfs2_clear_bit((bit_off + tmp), 2479 (unsigned long *) bg->bg_bitmap); 2480 if (undo_fn) 2481 undo_fn(bit_off + tmp, 2482 (unsigned long *) undo_bg->bg_bitmap); 2483 } 2484 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2485 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 2486 if (undo_fn) 2487 spin_unlock(&jh->b_state_lock); 2488 return ocfs2_error(alloc_inode->i_sb, "Group descriptor #%llu has bit count %u but claims %u are freed. num_bits %d\n", 2489 (unsigned long long)le64_to_cpu(bg->bg_blkno), 2490 le16_to_cpu(bg->bg_bits), 2491 le16_to_cpu(bg->bg_free_bits_count), 2492 num_bits); 2493 } 2494 2495 /* 2496 * TODO: even when 'num_bits == 1' (the worst case: releasing a 2497 * single cluster), we still need to rescan the whole bitmap.
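 * A possible optimization (a sketch only, not implemented here):
 * the free run containing the cleared bits could be measured by
 * scanning outward from bit_off and compared against the stored
 * maximum, instead of rescanning all le16_to_cpu(bg->bg_bits) bits.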
2498 */ 2499 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 2500 contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, 2501 le16_to_cpu(bg->bg_bits), 0); 2502 if (contig_bits > max_contig_bits) 2503 max_contig_bits = contig_bits; 2504 bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); 2505 } else { 2506 bg->bg_contig_free_bits = 0; 2507 } 2508 2509 if (undo_fn) 2510 spin_unlock(&jh->b_state_lock); 2511 2512 ocfs2_journal_dirty(handle, group_bh); 2513 bail: 2514 return status; 2515 } 2516 2517 /* 2518 * expects the suballoc inode to already be locked. 2519 */ 2520 static int _ocfs2_free_suballoc_bits(handle_t *handle, 2521 struct inode *alloc_inode, 2522 struct buffer_head *alloc_bh, 2523 unsigned int start_bit, 2524 u64 bg_blkno, 2525 unsigned int count, 2526 void (*undo_fn)(unsigned int bit, 2527 unsigned long *bitmap)) 2528 { 2529 int status = 0; 2530 u32 tmp_used; 2531 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2532 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2533 struct buffer_head *group_bh = NULL; 2534 struct ocfs2_group_desc *group; 2535 __le16 old_bg_contig_free_bits = 0; 2536 2537 /* The alloc_bh comes from ocfs2_free_dinode() or 2538 * ocfs2_free_clusters(). The callers have all locked the 2539 * allocator and gotten alloc_bh from the lock call. This 2540 * validates the dinode buffer. Any corruption that has happened 2541 * is a code bug. */ 2542 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2543 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 2544 2545 trace_ocfs2_free_suballoc_bits( 2546 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, 2547 (unsigned long long)bg_blkno, 2548 start_bit, count); 2549 2550 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, 2551 &group_bh); 2552 if (status < 0) { 2553 mlog_errno(status); 2554 goto bail; 2555 } 2556 group = (struct ocfs2_group_desc *) group_bh->b_data; 2557 2558 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 2559 2560 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2561 old_bg_contig_free_bits = group->bg_contig_free_bits; 2562 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2563 group, group_bh, 2564 start_bit, count, 0, undo_fn); 2565 if (status < 0) { 2566 mlog_errno(status); 2567 goto bail; 2568 } 2569 2570 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2571 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2572 if (status < 0) { 2573 mlog_errno(status); 2574 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, 2575 start_bit, count, 2576 le16_to_cpu(old_bg_contig_free_bits), 1); 2577 goto bail; 2578 } 2579 2580 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, 2581 count); 2582 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2583 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2584 ocfs2_journal_dirty(handle, alloc_bh); 2585 2586 bail: 2587 brelse(group_bh); 2588 return status; 2589 } 2590 2591 int ocfs2_free_suballoc_bits(handle_t *handle, 2592 struct inode *alloc_inode, 2593 struct buffer_head *alloc_bh, 2594 unsigned int start_bit, 2595 u64 bg_blkno, 2596 unsigned int count) 2597 { 2598 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, 2599 start_bit, bg_blkno, count, NULL); 2600 } 2601 2602 int ocfs2_free_dinode(handle_t *handle, 2603 struct inode *inode_alloc_inode, 2604 struct buffer_head *inode_alloc_bh, 2605 struct ocfs2_dinode *di) 2606 { 2607 u64 blk = le64_to_cpu(di->i_blkno); 2608 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2609 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2610 2611 
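/*
 * If i_suballoc_loc is set (as it is for inodes allocated from a
 * discontiguous block group), it names the group descriptor block
 * directly and overrides the group computed from (blkno, bit)
 * above.
 */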
if (di->i_suballoc_loc) 2612 bg_blkno = le64_to_cpu(di->i_suballoc_loc); 2613 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2614 inode_alloc_bh, bit, bg_blkno, 1); 2615 } 2616 2617 static int _ocfs2_free_clusters(handle_t *handle, 2618 struct inode *bitmap_inode, 2619 struct buffer_head *bitmap_bh, 2620 u64 start_blk, 2621 unsigned int num_clusters, 2622 void (*undo_fn)(unsigned int bit, 2623 unsigned long *bitmap)) 2624 { 2625 int status; 2626 u16 bg_start_bit; 2627 u64 bg_blkno; 2628 2629 /* You can't ever have a contiguous set of clusters 2630 * bigger than a block group bitmap so we never have to worry 2631 * about looping on them. 2632 * The BUG_ON below is expensive. We can safely remove it once this 2633 * code has been well tested. */ 2634 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, 2635 ocfs2_blocks_to_clusters(bitmap_inode->i_sb, 2636 start_blk))); 2637 2638 2639 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 2640 &bg_start_bit); 2641 2642 trace_ocfs2_free_clusters((unsigned long long)bg_blkno, 2643 (unsigned long long)start_blk, 2644 bg_start_bit, num_clusters); 2645 2646 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2647 bg_start_bit, bg_blkno, 2648 num_clusters, undo_fn); 2649 if (status < 0) { 2650 mlog_errno(status); 2651 goto out; 2652 } 2653 2654 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), 2655 num_clusters); 2656 2657 out: 2658 return status; 2659 } 2660 2661 int ocfs2_free_clusters(handle_t *handle, 2662 struct inode *bitmap_inode, 2663 struct buffer_head *bitmap_bh, 2664 u64 start_blk, 2665 unsigned int num_clusters) 2666 { 2667 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 2668 start_blk, num_clusters, 2669 _ocfs2_set_bit); 2670 } 2671 2672 /* 2673 * Give never-used clusters back to the global bitmap. We don't need 2674 * to protect these bits in the undo buffer. 2675 */ 2676 int ocfs2_release_clusters(handle_t *handle, 2677 struct inode *bitmap_inode, 2678 struct buffer_head *bitmap_bh, 2679 u64 start_blk, 2680 unsigned int num_clusters) 2681 { 2682 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 2683 start_blk, num_clusters, 2684 _ocfs2_clear_bit); 2685 } 2686 2687 /* 2688 * For a given allocation, determine which allocators will need to be 2689 * accessed, and lock them, reserving the appropriate number of bits. 2690 * 2691 * Sparse file systems call this from ocfs2_write_begin_nolock() 2692 * and ocfs2_allocate_unwritten_extents(). 2693 * 2694 * File systems which don't support holes call this from 2695 * ocfs2_extend_allocation().
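 *
 * A typical call for extending a file by one cluster with no
 * extent split (a hypothetical sketch; error handling elided)
 * looks like:
 *
 *	ret = ocfs2_lock_allocators(inode, &et, 1, 0, &data_ac,
 *				    &meta_ac);
 *
 * after which each non-NULL context is released with
 * ocfs2_free_alloc_context() once the transaction is done.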
2696 */ 2697 int ocfs2_lock_allocators(struct inode *inode, 2698 struct ocfs2_extent_tree *et, 2699 u32 clusters_to_add, u32 extents_to_split, 2700 struct ocfs2_alloc_context **data_ac, 2701 struct ocfs2_alloc_context **meta_ac) 2702 { 2703 int ret = 0, num_free_extents; 2704 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; 2705 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2706 2707 *meta_ac = NULL; 2708 if (data_ac) 2709 *data_ac = NULL; 2710 2711 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2712 2713 num_free_extents = ocfs2_num_free_extents(et); 2714 if (num_free_extents < 0) { 2715 ret = num_free_extents; 2716 mlog_errno(ret); 2717 goto out; 2718 } 2719 2720 /* 2721 * Sparse allocation file systems need to be more conservative 2722 * with reserving room for expansion - the actual allocation 2723 * happens while we've got a journal handle open so re-taking 2724 * a cluster lock (because we ran out of room for another 2725 * extent) will violate ordering rules. 2726 * 2727 * Most of the time we'll only be seeing this 1 cluster at a time 2728 * anyway. 2729 * 2730 * Always lock for any unwritten extents - we might want to 2731 * add blocks during a split. 2732 */ 2733 if (!num_free_extents || 2734 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { 2735 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); 2736 if (ret < 0) { 2737 if (ret != -ENOSPC) 2738 mlog_errno(ret); 2739 goto out; 2740 } 2741 } 2742 2743 if (clusters_to_add == 0) 2744 goto out; 2745 2746 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 2747 if (ret < 0) { 2748 if (ret != -ENOSPC) 2749 mlog_errno(ret); 2750 goto out; 2751 } 2752 2753 out: 2754 if (ret) { 2755 if (*meta_ac) { 2756 ocfs2_free_alloc_context(*meta_ac); 2757 *meta_ac = NULL; 2758 } 2759 2760 /* 2761 * We cannot have an error and a non-null *data_ac. 2762 */ 2763 } 2764 2765 return ret; 2766 } 2767 2768 /* 2769 * Read the inode specified by blkno to get suballoc_slot and 2770 * suballoc_bit.
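 *
 * Note the read below is a sync read with no cluster lock held, so
 * the returned values may be stale; ocfs2_test_inode_bit() relies
 * on higher-level synchronization (nfs_sync_lock) to make the
 * answer meaningful.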
2771 */ 2772 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2773 u16 *suballoc_slot, u64 *group_blkno, 2774 u16 *suballoc_bit) 2775 { 2776 int status; 2777 struct buffer_head *inode_bh = NULL; 2778 struct ocfs2_dinode *inode_fe; 2779 2780 trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); 2781 2782 /* dirty read: we read straight from disk without taking the cluster lock */ 2783 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 2784 if (status < 0) { 2785 mlog(ML_ERROR, "read block %llu failed %d\n", 2786 (unsigned long long)blkno, status); 2787 goto bail; 2788 } 2789 2790 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; 2791 if (!OCFS2_IS_VALID_DINODE(inode_fe)) { 2792 mlog(ML_ERROR, "invalid inode %llu requested\n", 2793 (unsigned long long)blkno); 2794 status = -EINVAL; 2795 goto bail; 2796 } 2797 2798 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && 2799 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { 2800 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", 2801 (unsigned long long)blkno, 2802 (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); 2803 status = -EINVAL; 2804 goto bail; 2805 } 2806 2807 if (suballoc_slot) 2808 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2809 if (suballoc_bit) 2810 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2811 if (group_blkno) 2812 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); 2813 2814 bail: 2815 brelse(inode_bh); 2816 2817 if (status) 2818 mlog_errno(status); 2819 return status; 2820 } 2821 2822 /* 2823 * Test whether the bit is SET in the allocator bitmap or not. On 2824 * success, 0 is returned and *res is 1 for SET; 0 otherwise. On 2825 * failure, a negative errno is returned and *res is meaningless. Call 2826 * this after you have taken the cluster lock on the suballocator, or 2827 * you may get a result based on non-up-to-date contents. 2828 */ 2829 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2830 struct inode *suballoc, 2831 struct buffer_head *alloc_bh, 2832 u64 group_blkno, u64 blkno, 2833 u16 bit, int *res) 2834 { 2835 struct ocfs2_dinode *alloc_di; 2836 struct ocfs2_group_desc *group; 2837 struct buffer_head *group_bh = NULL; 2838 u64 bg_blkno; 2839 int status; 2840 2841 trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, 2842 (unsigned int)bit); 2843 2844 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; 2845 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { 2846 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2847 (unsigned int)bit, 2848 ocfs2_bits_per_group(&alloc_di->id2.i_chain)); 2849 status = -EINVAL; 2850 goto bail; 2851 } 2852 2853 bg_blkno = group_blkno ? group_blkno : 2854 ocfs2_which_suballoc_group(blkno, bit); 2855 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, 2856 &group_bh); 2857 if (status < 0) { 2858 mlog(ML_ERROR, "read group %llu failed %d\n", 2859 (unsigned long long)bg_blkno, status); 2860 goto bail; 2861 } 2862 2863 group = (struct ocfs2_group_desc *) group_bh->b_data; 2864 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); 2865 2866 bail: 2867 brelse(group_bh); 2868 2869 if (status) 2870 mlog_errno(status); 2871 return status; 2872 } 2873 2874 /* 2875 * Test if the bit representing this inode (blkno) is set in the 2876 * suballocator. 2877 * 2878 * On success, 0 is returned and *res is 1 for SET; 0 otherwise. 2879 * 2880 * In the event of failure, a negative value is returned and *res is 2881 * meaningless.
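 *
 * For example (a hypothetical caller such as the NFS export code
 * revalidating a filehandle): ret == 0 && *res == 1 means the
 * inode is still allocated, ret == 0 && *res == 0 means it has
 * been freed, and ret < 0 means the question could not be
 * answered.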
2882 * 2883 * Callers must make sure to hold nfs_sync_lock to prevent 2884 * ocfs2_delete_inode() on another node from accessing the same 2885 * suballocator concurrently. 2886 */ 2887 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2888 { 2889 int status; 2890 u64 group_blkno = 0; 2891 u16 suballoc_bit = 0, suballoc_slot = 0; 2892 struct inode *inode_alloc_inode; 2893 struct buffer_head *alloc_bh = NULL; 2894 2895 trace_ocfs2_test_inode_bit((unsigned long long)blkno); 2896 2897 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2898 &group_blkno, &suballoc_bit); 2899 if (status < 0) { 2900 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2901 goto bail; 2902 } 2903 2904 if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) 2905 inode_alloc_inode = ocfs2_get_system_file_inode(osb, 2906 GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); 2907 else 2908 inode_alloc_inode = ocfs2_get_system_file_inode(osb, 2909 INODE_ALLOC_SYSTEM_INODE, suballoc_slot); 2910 if (!inode_alloc_inode) { 2911 /* the error code could be inaccurate, but we are not able to 2912 * get the correct one. */ 2913 status = -EINVAL; 2914 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", 2915 (u32)suballoc_slot); 2916 goto bail; 2917 } 2918 2919 inode_lock(inode_alloc_inode); 2920 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2921 if (status < 0) { 2922 inode_unlock(inode_alloc_inode); 2923 iput(inode_alloc_inode); 2924 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2925 (u32)suballoc_slot, status); 2926 goto bail; 2927 } 2928 2929 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2930 group_blkno, blkno, suballoc_bit, res); 2931 if (status < 0) 2932 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2933 2934 ocfs2_inode_unlock(inode_alloc_inode, 0); 2935 inode_unlock(inode_alloc_inode); 2936 2937 iput(inode_alloc_inode); 2938 brelse(alloc_bh); 2939 bail: 2940 if (status) 2941 mlog_errno(status); 2942 return status; 2943 } 2944
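/*
 * A rough end-to-end sketch of the free path above (hypothetical
 * caller; names and error handling are illustrative only):
 *
 *	status = ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
 *				     start_blk, num_clusters);
 *	if (status < 0)
 *		mlog_errno(status);
 *
 * where bitmap_inode/bitmap_bh come from locking the global bitmap
 * and start_blk is the first block of the extent being freed.
 */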