// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * suballoc.c
 *
 * metadata alloc and free
 * Inspired by ext3 block groups.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

/* Flags passed down to ocfs2_reserve_suballoc_bits()/ocfs2_block_group_alloc():
 * whether a new block group may be added when the allocator is full, and
 * whether new groups may be claimed from the global bitmap. */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/* Upper bound on consecutive inode/metadata allocations stolen from other
 * slots before we re-check our own slot's allocator. */
#define OCFS2_MAX_TO_STEAL		1024

/* Result of a single suballocator claim, filled in by the group search
 * and claim helpers. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};

/*
 * Return the block number of the group descriptor this result came
 * from, or 0 if nothing was allocated (sr_blkno == 0).  For a
 * contiguous group (sr_bg_blkno == 0) the group is computed from the
 * allocated block and its bit offset.
 */
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
{
	if (res->sr_blkno == 0)
		return 0;

	if (res->sr_bg_blkno)
		return res->sr_bg_blkno;

	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}

static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32
ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
				 u64 bg_blkno,
				 u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);

/*
 * Drop everything an allocation context holds: the cluster lock and
 * inode locks on the allocator inode, the inode reference, the cached
 * dinode buffer and any find_loc private data.  Safe to call on a
 * context whose reservation already failed (ac_inode may be NULL).
 * Note: OCFS2_AC_USE_LOCAL contexts never took the cluster lock, so it
 * is skipped for them.
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}

/* Release an allocation context's resources and free the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}

/* Total bitmap bits in one block group: clusters-per-group * bits-per-cluster. */
static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
{
	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}

/*
 * Validation helper: during resize we only log the problem; otherwise
 * the corruption is fatal and we return ocfs2_error() from the
 * enclosing function.  Relies on 'resize' and 'sb' in the caller's scope.
 */
#define do_error(fmt, ...)						\
	do {								\
		if (resize)						\
			mlog(ML_ERROR, fmt, ##__VA_ARGS__);		\
		else							\
			return ocfs2_error(sb, fmt, ##__VA_ARGS__);	\
	} while (0)

/*
 * Sanity-check the self-consistent fields of a group descriptor:
 * signature, location, generation and bit counts.
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}

/*
 * Check a group descriptor against its owning chain allocator dinode:
 * parent pointer, maximum bit count, and chain index.
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}

#undef do_error

/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}

/*
 * Read-time validator passed to ocfs2_read_block(): ECC check first
 * (non-fatal), then self-consistency checks that are fatal.
 */
static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	trace_ocfs2_validate_group_descriptor(
					(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}

/*
 * Read a group descriptor block and validate it both on its own and
 * against its parent chain allocator 'di'.  On success *bh holds a
 * referenced buffer (a new one if the caller passed *bh == NULL).
 */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up.
 */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}

/*
 * Append one cluster extent [p_blkno, clusters] to a discontiguous
 * block group's embedded extent list, and grow the group's bit counts
 * accordingly.  Only valid when the discontig-bg feature is enabled.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent: initialize the list's capacity. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* Logical start of this extent = bits accounted so far, in clusters. */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}

/*
 * Initialize a freshly claimed block group descriptor at group_blkno
 * and link it onto chain 'my_chain'.  The first bitmap bit is consumed
 * by the descriptor block itself.  Journals the descriptor with
 * ACCESS_CREATE.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
							  osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		/* Contiguous group: bit count is simply the full group size. */
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		/* Discontiguous: record the first extent; more may follow. */
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Return the index of the chain with the fewest total bits. */
static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_count)) {
		if (le32_to_cpu(cl->cl_recs[best].c_total) >
		    le32_to_cpu(cl->cl_recs[curr].c_total))
			best = curr;
		curr++;
	}
	return best;
}

/*
 * Claim one fully contiguous block group (cl_cpg clusters) from the
 * cluster allocator and initialize it.  Returns the new descriptor's
 * buffer_head, or ERR_PTR (-ENOSPC when no contiguous run exists).
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * Try to claim min_bits clusters, halving the request on each -ENOSPC
 * until it succeeds, hits another error, or min_bits reaches zero.
 */
static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	while (min_bits) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;

		min_bits >>= 1;
	}

	return status;
}

/*
 * Keep claiming cluster extents and appending them to a discontiguous
 * block group until it reaches cl_cpg clusters, or we run out of
 * extent records / space (-ENOSPC).
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		/* Next attempts need not exceed what we just managed to get. */
		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}

/*
 * Undo a failed discontiguous group allocation: free every cluster
 * extent already attached to the group and drop the descriptor buffer
 * from cache.  Best-effort — individual free failures are only logged.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}

/*
 * Build a block group out of several smaller cluster extents when no
 * contiguous cl_cpg run is available.  Requires the discontig-bg
 * feature; extends the transaction and disables chain relinking for
 * the duration.  Returns the descriptor bh or ERR_PTR.
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * We expect the block group allocator to already be locked.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* Only suballocators grow by adding groups; the cluster bitmap
	 * itself never goes through this path. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Seed the search with the caller's cached last allocation group. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	/* Prefer a contiguous group; fall back to a discontiguous one. */
	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Account the new group in its chain record and in the dinode's
	 * bitmap totals (the group's descriptor bit is already used). */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Lock the suballocator inode of the given type/slot and make sure it
 * has at least ac->ac_bits_wanted free bits, growing it by one block
 * group if 'flags' allow.  On success the context owns the inode lock
 * and a reference to the dinode buffer (ac_bh).
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug.
 */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* Reset the inode-steal state: no victim slot, zero steal count. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}

/* Reset the metadata-steal state: no victim slot, zero steal count. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}

/* Initialize both inode and metadata steal-slot bookkeeping. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}

/* Remember which slot we last stole the given resource type from. */
static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		osb->s_inode_steal_slot = (u16)slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		osb->s_meta_steal_slot = (u16)slot;
	spin_unlock(&osb->osb_lock);
}

/* Fetch the cached victim slot for the given resource type. */
static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
{
	int slot = OCFS2_INVALID_SLOT;

	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		slot = osb->s_inode_steal_slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		slot = osb->s_meta_steal_slot;
	spin_unlock(&osb->osb_lock);

	return slot;
}

static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Try to reserve bits from another slot's allocator when our own is
 * exhausted.  Walks every other slot (no new groups are created on
 * their behalf) and caches the first slot that satisfies us.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		ocfs2_free_ac_resource(ac);
	}

	return status;
}

static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Reserve 'blocks' metadata blocks from our slot's extent allocator,
 * falling back to stealing from other slots on -ENOSPC.  On success
 * *ac holds the locked allocator; the caller frees it.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* Keep stealing from the cached victim slot until the steal
	 * budget runs out, then retry our own allocator. */
	if (slot != OCFS2_INVALID_SLOT &&
		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Reserve enough metadata blocks for extending an extent tree rooted
 * at root_el.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}

/*
 * Reserve one bit in an inode allocator, preferring our own slot and
 * falling back to stealing from other slots.  Also remembers the last
 * inode allocation group across calls for locality.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* local alloc code has to do the same thing, so rather than do this
 * twice..
 */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
				      struct ocfs2_alloc_context *ac)
{
	int status;

	ac->ac_which = OCFS2_AC_USE_MAIN;
	ac->ac_group_search = ocfs2_cluster_group_search;

	status = ocfs2_reserve_suballoc_bits(osb, ac,
					     GLOBAL_BITMAP_SYSTEM_INODE,
					     OCFS2_INVALID_SLOT, NULL,
					     ALLOC_NEW_GROUP);
	if (status < 0 && status != -ENOSPC)
		mlog_errno(status);

	return status;
}

/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit. */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	/* Small requests go through the local alloc first, unless the
	 * caller explicitly wants the global bitmap. */
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/* Drop the allocator locks before flushing the
			 * truncate log, then retake them (or retry). */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* Reserve bits_wanted clusters with no block-number limit. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}

/*
 * More or less lifted from ext3. I'll leave their description below:
 *
 * "For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes."
 *
 * Note: OCFS2 already does this differently for metadata vs data
 * allocations, as those bitmaps are separate and undo access is never
 * called on a metadata group descriptor.
 */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct journal_head *jh;
	int ret;

	/* Bit already set in the current bitmap -- definitely not free. */
	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	/* No journal head means no committed-data copy to worry about;
	 * the on-disk state is the current state. */
	jh = jbd2_journal_grab_journal_head(bg_bh);
	if (!jh)
		return 1;

	/* Only allocatable if the bit is also clear in jbd2's
	 * "last committed data" copy (see the ext3 note above). */
	spin_lock(&jh->b_state_lock);
	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
	if (bg)
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	spin_unlock(&jh->b_state_lock);
	jbd2_journal_put_journal_head(jh);

	return ret;
}

/* Scan bitmap from 'start' and return the length of the largest run of
 * clear bits found before 'total_bits'. */
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
				    u16 total_bits, u16 start)
{
	u16 offset, free_bits;
	u16 contig_bits = 0;

	while (start < total_bits) {
		offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
		if (offset == total_bits)
			break;

		/* Run of zeros is [offset, start) where start is the
		 * next set bit (or total_bits). */
		start = ocfs2_find_next_bit(bitmap, total_bits, offset);
		free_bits = start - offset;
		if (contig_bits < free_bits)
			contig_bits = free_bits;
	}

	return contig_bits;
}

/* Find up to bits_wanted contiguous allocatable bits in the group
 * bitmap.  On success fills res->sr_bit_offset / sr_bits with the best
 * (longest, capped at bits_wanted) run found and sr_max_contig_bits
 * with the best run seen *before* the chosen one; returns -ENOSPC if
 * no allocatable bit exists. */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}

/* Mark num_bits bits starting at bit_off as used in the group bitmap,
 * under a journaled write (undo access for cluster bitmaps so freed
 * bits aren't reused before commit).  Also maintains
 * bg_contig_free_bits: fastpath callers pass the new value directly in
 * max_contig_bits; otherwise it is recomputed for cluster groups. */
int ocfs2_block_group_set_bits(handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_group_desc *bg,
			       struct buffer_head *group_bh,
			       unsigned int bit_off,
			       unsigned int num_bits,
			       unsigned int max_contig_bits,
			       int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* u16 underflow check: the subtraction above wrapped, meaning the
	 * descriptor claimed more free bits than it has. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while (num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * this is optimize path, caller set old contig value
	 * in max_contig_bits to bypass finding action.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
					le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}

/* find the one with the most empty bits */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	BUG_ON(!cl->cl_next_free_rec);

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
		    le32_to_cpu(cl->cl_recs[best].c_free))
			best = curr;
		curr++;
	}

	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
	return best;
}

/* Move group 'bg' to the head of its chain by unlinking it from after
 * prev_bg and pointing the chain record at it.  All three buffers are
 * modified under journal access; on a later journal-access failure the
 * already-modified in-memory next_group pointers are rolled back so we
 * never leave inconsistent blocks, journaled or not. */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Save the original link pointers for rollback. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}

/* Note: strictly "more free bits than wanted", not >=. */
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
}

/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error.
 */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Cheap rejection: a nonzero cached contig count smaller than
	 * bits_wanted means this group can't satisfy us. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may un-initialized, so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize. If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says. */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		/* Enforce the caller's high-block limit, if any. */
		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits. */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}

/* Search one suballocator (metadata/inode) group.  min_bits must be 1:
 * suballocator callers always take single bits. */
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res)
{
	int ret = -ENOSPC;
	u64 blkoff;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON(min_bits != 1);
	BUG_ON(ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
							group_bh, bits_wanted,
							le16_to_cpu(bg->bg_bits),
							res);
		if (!ret && max_block) {
			blkoff = le64_to_cpu(bg->bg_blkno) +
				res->sr_bit_offset + res->sr_bits;
			trace_ocfs2_block_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				ret = -ENOSPC;
		}
	}

	return ret;
}

/* Debit num_bits from chain 'chain' in the allocator dinode: bump
 * i_used and decrement the chain record's free count, under a
 * journaled write to di_bh. */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				     handle_t *handle,
				     struct buffer_head *di_bh,
				     u32 num_bits,
				     u16 chain)
{
	int ret;
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

out:
	return ret;
}

/* Reverse of ocfs2_alloc_dinode_update_counts(), used when setting the
 * group bits fails afterwards.  NOTE(review): no journal access here --
 * presumably di_bh is already part of the caller's transaction from the
 * update above; confirm before reuse elsewhere. */
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
					struct buffer_head *di_bh,
					u32 num_bits,
					u16 chain)
{
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl;

	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}

/* If res->sr_bit_offset lands inside this extent record of a
 * discontiguous group, translate it to a disk block in sr_blkno (and
 * clamp sr_bits to the record's end).  Returns 1 on a hit, 0 to keep
 * looking. */
static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
					 struct ocfs2_extent_rec *rec,
					 struct ocfs2_chain_list *cl)
{
	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;

	if (res->sr_bit_offset < bitoff)
		return 0;
	if (res->sr_bit_offset >= (bitoff + bitcount))
		return 0;
	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
		(res->sr_bit_offset - bitoff);
	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
	return 1;
}

/* Turn a (group, bit offset) search result into a disk block number,
 * walking the group's extent list when the group is discontiguous.
 * Cluster bitmap results get no block number here (callers translate
 * cluster offsets themselves). */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno; /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;	/* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno; /* Restore */
			break;
		}
	}
}

/* Try to allocate from exactly one group (res->sr_bg_blkno, typically
 * the last-used hint).  On success the dinode counts and group bitmap
 * are updated (unless ac_find_loc_only), and *bits_left reports the
 * group's remaining free bits. */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_group_descriptor(alloc_inode, di,
					  res->sr_bg_blkno, &group_bh);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		/* Keep dinode and group bitmap consistent on failure. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
						   res->sr_bits,
						   le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}

/* Walk chain ac->ac_chain looking for a group that can satisfy the
 * request, claim the bits there, and opportunistically relink a good
 * group to the head of its chain. */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			/* Discontig fallback: shrink the request to the
			 * largest contiguous run this group has, as long
			 * as that still meets min_bits. */
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
					     bits_wanted, min_bits,
					     ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
						   ac->ac_bh, res->sr_bits,
						   chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
		(unsigned long long)le64_to_cpu(fe->i_blkno),
		res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* will give out up to bits_wanted contiguous bits. */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left);
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Claim up to bits_wanted metadata blocks from a META suballocator.
 * Returns the group (suballoc_loc), the starting bit within it, the
 * number of bits actually claimed, and the first disk block. */
int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   bits_wanted,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	*suballoc_loc = res.sr_bg_blkno;
	*suballoc_bit_start = res.sr_bit_offset;
	*blkno_start = res.sr_blkno;
	ac->ac_bits_given += res.sr_bits;
	*num_bits = res.sr_bits;
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}

static void ocfs2_init_inode_ac_group(struct inode *dir,
				      struct buffer_head *parent_di_bh,
				      struct ocfs2_alloc_context *ac)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
	/*
	 * Try to allocate inodes from some specific group.
	 *
	 * If the parent dir has recorded the last group used in allocation,
	 * cool, use it. Otherwise if we try to allocate new inode from the
	 * same slot the parent dir belongs to, use the same chunk.
	 *
	 * We are very careful here to avoid the mistake of setting
	 * ac_last_group to a group descriptor from a different (unlocked) slot.
	 */
	if (OCFS2_I(dir)->ip_last_used_group &&
	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
		if (di->i_suballoc_loc)
			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
		else
			ac->ac_last_group = ocfs2_which_suballoc_group(
					le64_to_cpu(di->i_blkno),
					le16_to_cpu(di->i_suballoc_bit));
	}
}

/* Remember where this allocation landed so the next inode created in
 * 'dir' starts its search in the same group. */
static inline void ocfs2_save_inode_ac_group(struct inode *dir,
					     struct ocfs2_alloc_context *ac)
{
	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}

/* First half of a two-phase inode allocation: locate (but do not claim)
 * a free inode slot.  The found location is stashed in
 * ac->ac_find_loc_priv for ocfs2_claim_new_inode_at_loc(); on failure
 * the result struct is freed here. */
int ocfs2_find_new_inode_loc(struct inode *dir,
			     struct buffer_head *parent_fe_bh,
			     struct ocfs2_alloc_context *ac,
			     u64 *fe_blkno)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_suballoc_result *res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	res = kzalloc(sizeof(*res), GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	if (ret)
		kfree(res);

	return ret;
}

/* Second half of a two-phase inode allocation: actually claim the bit
 * previously located by ocfs2_find_new_inode_loc(). */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
						   ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}

/* Single-phase inode allocation: find and claim one inode bit in one
 * call, recording the group used in the parent dir for locality. */
int ocfs2_claim_new_inode(handle_t *handle,
			  struct inode *dir,
			  struct buffer_head *parent_fe_bh,
			  struct ocfs2_alloc_context *ac,
			  u64 *suballoc_loc,
			  u16 *suballoc_bit,
			  u64 *fe_blkno)
{
	int status;
	struct ocfs2_suballoc_result res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   1,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	BUG_ON(res.sr_bits != 1);

	*suballoc_loc = res.sr_bg_blkno;
	*suballoc_bit = res.sr_bit_offset;
	*fe_blkno = res.sr_blkno;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* translate a group desc. blkno and it's bitmap offset into
 * disk cluster offset. */
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 cluster = 0;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* The first group starts at cluster 0; every other group's
	 * block number converts directly to its starting cluster. */
	if (bg_blkno != osb->first_cluster_group_blkno)
		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
	cluster += (u32) bg_bit_off;
	return cluster;
}

/* given a cluster offset, calculate which block group it belongs to
 * and return that block offset. */
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 group_no;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	group_no = cluster / osb->bitmap_cpg;
	if (!group_no)
		return osb->first_cluster_group_blkno;
	return ocfs2_clusters_to_blocks(inode->i_sb,
					group_no * osb->bitmap_cpg);
}

/* given the block number of a cluster start, calculate which cluster
 * group and descriptor bitmap offset that corresponds to.
 */
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	*bg_blkno = ocfs2_which_cluster_group(inode,
					      data_cluster);

	/* The bit offset is relative to the group's start; the first
	 * group begins at cluster 0, so there the cluster number is
	 * already the offset. */
	if (*bg_blkno == osb->first_cluster_group_blkno)
		*bg_bit_off = (u16) data_cluster;
	else
		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
							     data_blkno - *bg_blkno);
}

/*
 * min_bits - minimum contiguous chunk from this total allocation we
 * can handle. set to what we asked for originally for a full
 * contig. allocation, set to '1' to indicate we can deal with extents
 * of any size.
 */
int __ocfs2_claim_clusters(handle_t *handle,
			   struct ocfs2_alloc_context *ac,
			   u32 min_clusters,
			   u32 max_clusters,
			   u32 *cluster_start,
			   u32 *num_clusters)
{
	int status;
	unsigned int bits_wanted = max_clusters;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);

	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
	       && ac->ac_which != OCFS2_AC_USE_MAIN
	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);

	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
		/* The local allocator hands out single contiguous
		 * windows; callers wanting larger contiguity must use
		 * the main bitmap. */
		WARN_ON(min_clusters > 1);

		status = ocfs2_claim_local_alloc_bits(osb,
						      handle,
						      ac,
						      bits_wanted,
						      cluster_start,
						      num_clusters);
		if (!status)
			atomic_inc(&osb->alloc_stats.local_data);
	} else {
		if (min_clusters > (osb->bitmap_cpg - 1)) {
			/* The only paths asking for contiguousness
			 * should know about this already.
			 */
			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
			     "group bitmap size %u!\n", min_clusters,
			     osb->bitmap_cpg);
			status = -ENOSPC;
			goto bail;
		}
		/* clamp the current request down to a realistic size. */
		if (bits_wanted > (osb->bitmap_cpg - 1))
			bits_wanted = osb->bitmap_cpg - 1;

		status = ocfs2_claim_suballoc_bits(ac,
						   handle,
						   bits_wanted,
						   min_clusters,
						   &res);
		if (!status) {
			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
			*cluster_start =
				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
								 res.sr_bg_blkno,
								 res.sr_bit_offset);
			atomic_inc(&osb->alloc_stats.bitmap_data);
			*num_clusters = res.sr_bits;
		}
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	ac->ac_bits_given += *num_clusters;

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Convenience wrapper: ask for everything still outstanding on @ac. */
int ocfs2_claim_clusters(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 min_clusters,
			 u32 *cluster_start,
			 u32 *num_clusters)
{
	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;

	return __ocfs2_claim_clusters(handle, ac, min_clusters,
				      bits_wanted, cluster_start, num_clusters);
}

/*
 * Clear @num_bits bits starting at @bit_off in group @bg under journal
 * protection.  When @undo_fn is given (cluster bitmap frees), the bits
 * are also recorded in the jbd2 committed-data copy so a crash cannot
 * resurrect them prematurely.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor(). Any corruption is a code bug.
	 */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	/* Undo protection is only meaningful for the global cluster
	 * bitmap. */
	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_state_lock guards b_committed_data, the snapshot
		 * jbd2 will write if the transaction aborts. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		/* More free bits than total bits: on-disk corruption. */
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
					le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		/* Only cluster groups track contiguous free runs. */
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}

/*
 * expects the suballoc inode to already be locked.
 */
static int _ocfs2_free_suballoc_bits(handle_t *handle,
				     struct inode *alloc_inode,
				     struct buffer_head *alloc_bh,
				     unsigned int start_bit,
				     u64 bg_blkno,
				     unsigned int count,
				     void (*undo_fn)(unsigned int bit,
						     unsigned long *bitmap))
{
	int status = 0;
	u32 tmp_used;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *group;
	__le16 old_bg_contig_free_bits = 0;

	/* The alloc_bh comes from ocfs2_free_dinode() or
	 * ocfs2_free_clusters(). The callers have all locked the
	 * allocator and gotten alloc_bh from the lock call. This
	 * validates the dinode buffer. Any corruption that has happened
	 * is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));

	trace_ocfs2_free_suballoc_bits(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		(unsigned long long)bg_blkno,
		start_bit, count);

	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));

	/* Snapshot the contig count so a failed dinode journal access
	 * below can restore the group descriptor exactly. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		old_bg_contig_free_bits = group->bg_contig_free_bits;
	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
					      group, group_bh,
					      start_bit, count, 0, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		/* Compensate: re-set the bits we just cleared so group
		 * and dinode accounting remain consistent. */
		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
				start_bit, count,
				le16_to_cpu(old_bg_contig_free_bits), 1);
		goto bail;
	}

	/* Update the chain record and the allocator-wide used count. */
	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
		     count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
	ocfs2_journal_dirty(handle, alloc_bh);

bail:
	brelse(group_bh);
	return status;
}

/* Free suballocator bits with no journal-undo protection. */
int ocfs2_free_suballoc_bits(handle_t *handle,
			     struct inode *alloc_inode,
			     struct buffer_head *alloc_bh,
			     unsigned int start_bit,
			     u64 bg_blkno,
			     unsigned int count)
{
	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
					 start_bit, bg_blkno, count, NULL);
}

/* Return an inode's single suballocator bit to its inode allocator. */
int ocfs2_free_dinode(handle_t *handle,
		      struct inode *inode_alloc_inode,
		      struct buffer_head *inode_alloc_bh,
		      struct ocfs2_dinode *di)
{
	u64 blk = le64_to_cpu(di->i_blkno);
	u16 bit = le16_to_cpu(di->i_suballoc_bit);
	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);

	/* Discontiguous block groups record the group block
	 * explicitly; prefer that over the computed location. */
	if (di->i_suballoc_loc)
		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
					inode_alloc_bh, bit, bg_blkno, 1);
}

static int _ocfs2_free_clusters(handle_t *handle,
				struct inode *bitmap_inode,
				struct buffer_head *bitmap_bh,
				u64 start_blk,
				unsigned int num_clusters,
				void (*undo_fn)(unsigned int bit,
						unsigned long *bitmap))
{
	int status;
	u16 bg_start_bit;
	u64 bg_blkno;

	/* You can't ever have a contiguous set of clusters
	 * bigger than a block group bitmap so we never have to worry
	 * about looping on them.
	 * This is expensive. We can safely remove once this stuff has
	 * gotten tested really well.
	 */
	/* Sanity: start_blk must be cluster-aligned (round-tripping
	 * through cluster units must be lossless). */
	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
							 start_blk)));


	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
				     &bg_start_bit);

	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
			(unsigned long long)start_blk,
			bg_start_bit, num_clusters);

	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
					   bg_start_bit, bg_blkno,
					   num_clusters, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* Tell the local allocator that space reappeared in the
	 * global bitmap. */
	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
					 num_clusters);

out:
	return status;
}

/*
 * Free previously-used clusters.  The bits are set in the jbd2 undo
 * buffer so an aborted transaction cannot hand them out again before
 * the free commits.
 */
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_set_bit);
}

/*
 * Give never-used clusters back to the global bitmap. We don't need
 * to protect these bits in the undo buffer.
 */
int ocfs2_release_clusters(handle_t *handle,
			   struct inode *bitmap_inode,
			   struct buffer_head *bitmap_bh,
			   u64 start_blk,
			   unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_clear_bit);
}

/*
 * For a given allocation, determine which allocators will need to be
 * accessed, and lock them, reserving the appropriate number of bits.
 *
 * Sparse file systems call this from ocfs2_write_begin_nolock()
 * and ocfs2_allocate_unwritten_extents().
 *
 * File systems which don't support holes call this from
 * ocfs2_extend_allocation().
 */
int ocfs2_lock_allocators(struct inode *inode,
			  struct ocfs2_extent_tree *et,
			  u32 clusters_to_add, u32 extents_to_split,
			  struct ocfs2_alloc_context **data_ac,
			  struct ocfs2_alloc_context **meta_ac)
{
	int ret = 0, num_free_extents;
	/* A split can consume up to two extra extent records. */
	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	*meta_ac = NULL;
	if (data_ac)
		*data_ac = NULL;

	BUG_ON(clusters_to_add != 0 && data_ac == NULL);

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Sparse allocation file systems need to be more conservative
	 * with reserving room for expansion - the actual allocation
	 * happens while we've got a journal handle open so re-taking
	 * a cluster lock (because we ran out of room for another
	 * extent) will violate ordering rules.
	 *
	 * Most of the time we'll only be seeing this 1 cluster at a time
	 * anyway.
	 *
	 * Always lock for any unwritten extents - we might want to
	 * add blocks during a split.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

	if (clusters_to_add == 0)
		goto out;

	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

out:
	if (ret) {
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}

		/*
		 * We cannot have an error and a non null *data_ac.
		 */
	}

	return ret;
}

/*
 * Read the inode specified by blkno to get suballoc_slot and
 * suballoc_bit.
 */
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
				       u16 *suballoc_slot, u64 *group_blkno,
				       u16 *suballoc_bit)
{
	int status;
	struct buffer_head *inode_bh = NULL;
	struct ocfs2_dinode *inode_fe;

	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);

	/* dirty read disk */
	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
	if (status < 0) {
		mlog(ML_ERROR, "read block %llu failed %d\n",
		     (unsigned long long)blkno, status);
		goto bail;
	}

	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
		mlog(ML_ERROR, "invalid inode %llu requested\n",
		     (unsigned long long)blkno);
		status = -EINVAL;
		goto bail;
	}

	/* Slot must be OCFS2_INVALID_SLOT (global allocator) or within
	 * the mounted slot count. */
	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
		     (unsigned long long)blkno,
		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
		status = -EINVAL;
		goto bail;
	}

	/* All out-parameters are optional. */
	if (suballoc_slot)
		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
	if (suballoc_bit)
		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
	if (group_blkno)
		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);

bail:
	brelse(inode_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * test whether bit is SET in allocator bitmap or not. on success, 0
 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
 * is returned and *res is meaningless.
 Call this after you have
 * cluster locked against suballoc, or you may get a result based on
 * non-up2date contents
 */
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
				   struct inode *suballoc,
				   struct buffer_head *alloc_bh,
				   u64 group_blkno, u64 blkno,
				   u16 bit, int *res)
{
	struct ocfs2_dinode *alloc_di;
	struct ocfs2_group_desc *group;
	struct buffer_head *group_bh = NULL;
	u64 bg_blkno;
	int status;

	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
				      (unsigned int)bit);

	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
	/* Reject bit numbers beyond the allocator's group size. */
	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
		     (unsigned int)bit,
		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
		status = -EINVAL;
		goto bail;
	}

	/* Prefer the recorded group block (discontiguous groups);
	 * otherwise derive it from the inode block number. */
	bg_blkno = group_blkno ? group_blkno :
		   ocfs2_which_suballoc_group(blkno, bit);
	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog(ML_ERROR, "read group %llu failed %d\n",
		     (unsigned long long)bg_blkno, status);
		goto bail;
	}

	group = (struct ocfs2_group_desc *) group_bh->b_data;
	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);

bail:
	brelse(group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Test if the bit representing this inode (blkno) is set in the
 * suballocator.
 *
 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
 *
 * In the event of failure, a negative value is returned and *res is
 * meaningless.
 *
 * Callers must make sure to hold nfs_sync_lock to prevent
 * ocfs2_delete_inode() on another node from accessing the same
 * suballocator concurrently.
 */
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
	int status;
	u64 group_blkno = 0;
	u16 suballoc_bit = 0, suballoc_slot = 0;
	struct inode *inode_alloc_inode;
	struct buffer_head *alloc_bh = NULL;

	trace_ocfs2_test_inode_bit((unsigned long long)blkno);

	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
					     &group_blkno, &suballoc_bit);
	if (status < 0) {
		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
		goto bail;
	}

	/* OCFS2_INVALID_SLOT means the inode was allocated from the
	 * global inode allocator, not a per-slot one. */
	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	else
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	if (!inode_alloc_inode) {
		/* the error code could be inaccurate, but we are not able to
		 * get the correct one. */
		status = -EINVAL;
		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
		     (u32)suballoc_slot);
		goto bail;
	}

	/* Take the VFS mutex first, then the cluster lock (lock
	 * ordering); drop both in reverse on every path. */
	inode_lock(inode_alloc_inode);
	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
	if (status < 0) {
		inode_unlock(inode_alloc_inode);
		iput(inode_alloc_inode);
		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
		     (u32)suballoc_slot, status);
		goto bail;
	}

	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
					 group_blkno, blkno, suballoc_bit, res);
	if (status < 0)
		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);

	ocfs2_inode_unlock(inode_alloc_inode, 0);
	inode_unlock(inode_alloc_inode);

	iput(inode_alloc_inode);
	brelse(alloc_bh);
bail:
	if (status)
		mlog_errno(status);
	return status;
}