// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * suballoc.c
 *
 * metadata alloc and free
 * Inspired by ext3 block groups.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/highmem.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

/* Flags controlling how far a reservation may go to find space. */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/* Cap on consecutive steals from other slots before retrying our own. */
#define OCFS2_MAX_TO_STEAL		1024

/* Result of a single suballocator claim. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};

/*
 * Map a suballoc result back to the block number of the group
 * descriptor it came from; returns 0 if nothing was allocated.
 */
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
{
	if (res->sr_blkno == 0)
		return 0;

	if (res->sr_bg_blkno)
		return res->sr_bg_blkno;

	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}

static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32
ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
				 u64 bg_blkno,
				 u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);

/*
 * Drop everything an alloc context holds: the allocator inode (with its
 * cluster lock and i_rwsem), the allocator dinode buffer, and any
 * find-location state.  The context struct itself is left intact.
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		/* The local alloc path never took the cluster lock. */
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}

/* Release an alloc context's resources and free the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}

/* Total bitmap bits per block group: clusters/group * bits/cluster. */
static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
{
	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}

/*
 * During resize only log a bad descriptor; otherwise mark the
 * filesystem in error and return from the enclosing function.
 * Relies on 'sb' and 'resize' being in scope at the expansion site.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)

/* Validate the group-descriptor fields that stand on their own. */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}

/* Validate a group descriptor against the chain allocator owning it. */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	      le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}

#undef do_error

/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}

/* Full validation used on every descriptor read; failures are fatal. */
static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	trace_ocfs2_validate_group_descriptor(
					(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}

/*
 * The hint group descriptor (gd) may already have been released
 * in _ocfs2_free_suballoc_bits(). We first check the gd signature,
 * then perform the standard ocfs2_read_group_descriptor() jobs.
 *
 * If the gd signature is invalid, we return 'rc=0' and set
 * '*released=1'. The caller is expected to handle this specific case.
 * Otherwise, we return the actual error code.
 *
 * We treat gd signature corruption case as a release case. The
 * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain()
 * to search each gd block. The code will eventually find this
 * corrupted gd block - Late, but not missed.
 *
 * Note:
 * The caller is responsible for initializing the '*released' status.
 */
static int ocfs2_read_hint_group_descriptor(struct inode *inode,
		struct ocfs2_dinode *di, u64 gd_blkno,
		struct buffer_head **bh, int *released)
{
	int rc;
	struct buffer_head *tmp = *bh;
	struct ocfs2_group_desc *gd;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL);
	if (rc)
		goto out;

	gd = (struct ocfs2_group_desc *) tmp->b_data;
	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		/*
		 * Invalid gd cache was set in ocfs2_read_block(),
		 * which will affect block_group allocation.
		 * Path:
		 * ocfs2_reserve_suballoc_bits
		 *  ocfs2_block_group_alloc
		 *   ocfs2_block_group_alloc_contig
		 *    ocfs2_set_new_buffer_uptodate
		 */
		ocfs2_remove_from_cache(INODE_CACHE(inode), tmp);
		*released = 1; /* we return 'rc=0' for this case */
		goto free_bh;
	}

	/* below jobs same with ocfs2_read_group_descriptor() */
	if (!buffer_jbd(tmp)) {
		rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp);
		if (rc)
			goto free_bh;
	}

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc)
		goto free_bh;

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

	return rc;

free_bh:
	brelse(tmp);
out:
	return rc;
}

/* Read and fully validate the group descriptor at gd_blkno. */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}

/*
 * Append an extent record for (p_blkno, clusters) to a discontiguous
 * block group and grow the group's bit counts to match.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent: initialize the embedded extent list. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}

/*
 * Initialize a freshly-allocated block group descriptor and link it at
 * the head of chain 'my_chain'.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size =
		cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						    osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Pick the chain with the fewest total bits, keeping chains balanced. */
static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_count)) {
		if (le32_to_cpu(cl->cl_recs[best].c_total) >
		    le32_to_cpu(cl->cl_recs[curr].c_total))
			best = curr;
		curr++;
	}
	return best;
}

/*
 * Allocate a fully contiguous block group (cl_cpg clusters) and fill
 * its descriptor.  Returns the descriptor bh, or ERR_PTR on failure.
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * Claim up to min_bits clusters, halving the request on each -ENOSPC
 * until some (possibly smaller) extent is found or min_bits hits zero.
 */
static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	while (min_bits) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;

		min_bits >>= 1;
	}

	return status;
}

/*
 * Keep adding extents to a discontiguous block group until it reaches
 * cl_cpg clusters, or fail with -ENOSPC if the extent list fills first.
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}

/*
 * Undo a partially-built discontiguous group: give all its extents back
 * to the cluster allocator and drop the descriptor buffer.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}

/*
 * Build a block group out of several smaller cluster extents, for when
 * a contiguous cl_cpg-sized chunk is not available.
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto
bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	/* On any failure, give back every cluster already claimed. */
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * We expect the block group allocator to already be locked.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* Only suballocators grow; the cluster bitmap never does. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	/* No contiguous chunk left; fall back to a discontiguous group. */
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Account the new group in its chain record and the dinode. */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it.
	 */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Lock the suballocator inode for (type, slot) and make sure it has at
 * least ac->ac_bits_wanted free bits, growing it by a new block group
 * if 'flags' permit.  On success ac->ac_inode and ac->ac_bh are filled
 * in and left locked; ocfs2_free_ac_resource() undoes that.
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* Forget the cached inode steal target and reset its counter. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}

/* Forget the cached metadata steal target and reset its counter. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}

void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}

/* Remember which slot we last stole inodes/metadata from. */
static void
__ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) 952 { 953 spin_lock(&osb->osb_lock); 954 if (type == INODE_ALLOC_SYSTEM_INODE) 955 osb->s_inode_steal_slot = (u16)slot; 956 else if (type == EXTENT_ALLOC_SYSTEM_INODE) 957 osb->s_meta_steal_slot = (u16)slot; 958 spin_unlock(&osb->osb_lock); 959 } 960 961 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) 962 { 963 int slot = OCFS2_INVALID_SLOT; 964 965 spin_lock(&osb->osb_lock); 966 if (type == INODE_ALLOC_SYSTEM_INODE) 967 slot = osb->s_inode_steal_slot; 968 else if (type == EXTENT_ALLOC_SYSTEM_INODE) 969 slot = osb->s_meta_steal_slot; 970 spin_unlock(&osb->osb_lock); 971 972 return slot; 973 } 974 975 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) 976 { 977 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); 978 } 979 980 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) 981 { 982 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); 983 } 984 985 static int ocfs2_steal_resource(struct ocfs2_super *osb, 986 struct ocfs2_alloc_context *ac, 987 int type) 988 { 989 int i, status = -ENOSPC; 990 int slot = __ocfs2_get_steal_slot(osb, type); 991 992 /* Start to steal resource from the first slot after ours. 
*/ 993 if (slot == OCFS2_INVALID_SLOT) 994 slot = osb->slot_num + 1; 995 996 for (i = 0; i < osb->max_slots; i++, slot++) { 997 if (slot == osb->max_slots) 998 slot = 0; 999 1000 if (slot == osb->slot_num) 1001 continue; 1002 1003 status = ocfs2_reserve_suballoc_bits(osb, ac, 1004 type, 1005 (u32)slot, NULL, 1006 NOT_ALLOC_NEW_GROUP); 1007 if (status >= 0) { 1008 __ocfs2_set_steal_slot(osb, slot, type); 1009 break; 1010 } 1011 1012 ocfs2_free_ac_resource(ac); 1013 } 1014 1015 return status; 1016 } 1017 1018 static int ocfs2_steal_inode(struct ocfs2_super *osb, 1019 struct ocfs2_alloc_context *ac) 1020 { 1021 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); 1022 } 1023 1024 static int ocfs2_steal_meta(struct ocfs2_super *osb, 1025 struct ocfs2_alloc_context *ac) 1026 { 1027 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); 1028 } 1029 1030 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 1031 int blocks, 1032 struct ocfs2_alloc_context **ac) 1033 { 1034 int status; 1035 int slot = ocfs2_get_meta_steal_slot(osb); 1036 1037 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 1038 if (!(*ac)) { 1039 status = -ENOMEM; 1040 mlog_errno(status); 1041 goto bail; 1042 } 1043 1044 (*ac)->ac_bits_wanted = blocks; 1045 (*ac)->ac_which = OCFS2_AC_USE_META; 1046 (*ac)->ac_group_search = ocfs2_block_group_search; 1047 1048 if (slot != OCFS2_INVALID_SLOT && 1049 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) 1050 goto extent_steal; 1051 1052 atomic_set(&osb->s_num_meta_stolen, 0); 1053 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1054 EXTENT_ALLOC_SYSTEM_INODE, 1055 (u32)osb->slot_num, NULL, 1056 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); 1057 1058 1059 if (status >= 0) { 1060 status = 0; 1061 if (slot != OCFS2_INVALID_SLOT) 1062 ocfs2_init_meta_steal_slot(osb); 1063 goto bail; 1064 } else if (status < 0 && status != -ENOSPC) { 1065 mlog_errno(status); 1066 goto bail; 1067 } 1068 1069 
ocfs2_free_ac_resource(*ac); 1070 1071 extent_steal: 1072 status = ocfs2_steal_meta(osb, *ac); 1073 atomic_inc(&osb->s_num_meta_stolen); 1074 if (status < 0) { 1075 if (status != -ENOSPC) 1076 mlog_errno(status); 1077 goto bail; 1078 } 1079 1080 status = 0; 1081 bail: 1082 if ((status < 0) && *ac) { 1083 ocfs2_free_alloc_context(*ac); 1084 *ac = NULL; 1085 } 1086 1087 if (status) 1088 mlog_errno(status); 1089 return status; 1090 } 1091 1092 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 1093 struct ocfs2_extent_list *root_el, 1094 struct ocfs2_alloc_context **ac) 1095 { 1096 return ocfs2_reserve_new_metadata_blocks(osb, 1097 ocfs2_extend_meta_needed(root_el), 1098 ac); 1099 } 1100 1101 int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 1102 struct ocfs2_alloc_context **ac) 1103 { 1104 int status; 1105 int slot = ocfs2_get_inode_steal_slot(osb); 1106 u64 alloc_group; 1107 1108 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 1109 if (!(*ac)) { 1110 status = -ENOMEM; 1111 mlog_errno(status); 1112 goto bail; 1113 } 1114 1115 (*ac)->ac_bits_wanted = 1; 1116 (*ac)->ac_which = OCFS2_AC_USE_INODE; 1117 1118 (*ac)->ac_group_search = ocfs2_block_group_search; 1119 1120 /* 1121 * stat(2) can't handle i_ino > 32bits, so we tell the 1122 * lower levels not to allocate us a block group past that 1123 * limit. The 'inode64' mount option avoids this behavior. 1124 */ 1125 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) 1126 (*ac)->ac_max_block = (u32)~0U; 1127 1128 /* 1129 * slot is set when we successfully steal inode from other nodes. 1130 * It is reset in 3 places: 1131 * 1. when we flush the truncate log 1132 * 2. when we complete local alloc recovery. 1133 * 3. when we successfully allocate from our own slot. 1134 * After it is set, we will go on stealing inodes until we find the 1135 * need to check our slots to see whether there is some space for us. 
 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Publish the (possibly updated) allocation-group hint
		 * under osb_lock for the next reservation. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* local alloc code has to do the same thing, so rather than do this
 * twice..
 */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
				      struct ocfs2_alloc_context *ac)
{
	int status;

	ac->ac_which = OCFS2_AC_USE_MAIN;
	ac->ac_group_search = ocfs2_cluster_group_search;

	status = ocfs2_reserve_suballoc_bits(osb, ac,
					     GLOBAL_BITMAP_SYSTEM_INODE,
					     OCFS2_INVALID_SLOT, NULL,
					     ALLOC_NEW_GROUP);
	if (status < 0 && status != -ENOSPC)
		mlog_errno(status);

	return status;
}

/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit. */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/* Drop the cluster lock and inode mutex before
			 * flushing the truncate log (it takes them itself). */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				/* Truncate log freed space; retry with a
				 * fresh reservation (which re-takes ac_inode). */
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				/* Couldn't re-take the cluster lock; drop our
				 * reference so bail doesn't double-unlock. */
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}

/*
 * More or less lifted from ext3. I'll leave their description below:
 *
 * "For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes."
 *
 * Note: OCFS2 already does this differently for metadata vs data
 * allocations, as those bitmaps are separate and undo access is never
 * called on a metadata group descriptor.
 */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct journal_head *jh;
	int ret;

	/* Bit already set on disk copy -- definitely not allocatable. */
	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	/* No journal head means no committed-data copy to consult. */
	jh = jbd2_journal_grab_journal_head(bg_bh);
	if (!jh)
		return 1;

	/* b_committed_data is the "last committed" bitmap copy; a bit
	 * still set there belongs to an uncommitted free and must not
	 * be reused yet (see the ext3 comment above). */
	spin_lock(&jh->b_state_lock);
	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
	if (bg)
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	spin_unlock(&jh->b_state_lock);
	jbd2_journal_put_journal_head(jh);

	return ret;
}

/*
 * Return the length of the longest run of zero (free) bits in
 * bitmap[start, total_bits).
 */
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
			 u16 total_bits, u16 start)
{
	u16 offset, free_bits;
	u16 contig_bits = 0;

	while (start < total_bits) {
		offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
		if (offset == total_bits)
			break;

		start = ocfs2_find_next_bit(bitmap, total_bits, offset);
		free_bits = start - offset;
		if (contig_bits < free_bits)
			contig_bits = free_bits;
	}

	return contig_bits;
}

/*
 * Find up to bits_wanted contiguous allocatable bits in the group
 * bitmap. On success fills res->sr_bit_offset / sr_bits with the best
 * (longest, capped at bits_wanted) run found and sr_max_contig_bits
 * with the best run seen before it; returns -ENOSPC if no allocatable
 * bit exists.
 */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}

/*
 * Mark [bit_off, bit_off + num_bits) allocated in the group bitmap
 * under the journal, update bg_free_bits_count and the cached
 * bg_contig_free_bits. fastpath callers supply the new contig value in
 * max_contig_bits to skip rescanning the bitmap. Uses UNDO journal
 * access for the cluster bitmap (see the ext3 comment further up).
 */
int ocfs2_block_group_set_bits(handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_group_desc *bg,
			       struct buffer_head *group_bh,
			       unsigned int bit_off,
			       unsigned int num_bits,
			       unsigned int max_contig_bits,
			       int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* Underflow check: the subtraction above wrapped if free count
	 * now exceeds total bits -- on-disk corruption. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * this is optimize path, caller set old contig value
	 * in max_contig_bits to bypass finding action.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}

/* find the one with the most empty bits */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	BUG_ON(!cl->cl_next_free_rec);

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
		    le32_to_cpu(cl->cl_recs[best].c_free))
			best = curr;
		curr++;
	}

	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
	return best;
}

/*
 * Move bg to the head of its chain: prev_bg skips over bg, bg points
 * at the old chain head, and the dinode chain record points at bg.
 * Journal failures roll the in-memory next_group pointers back so the
 * cached descriptors stay consistent with disk.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug.
 */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Save old link values for rollback on journal failure. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

	/* Rollback labels undo the in-memory edits in reverse order. */
out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}

/* Worth relinking to the chain head only if the group would still have
 * free bits left after this allocation. */
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
}

/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error.
 */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Cheap reject using the cached longest-free-run counter. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may be uninitialized, so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize.  If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says.
		 */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		/* Respect the caller's upper block limit (e.g. the 32-bit
		 * i_ino cap) -- reject hits past it. */
		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits.
 */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}

/*
 * Group-search callback for suballocators (inode/extent). Same return
 * convention as ocfs2_cluster_group_search(): 0 on success, -ENOSPC to
 * keep searching, other negatives on error.
 */
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res)
{
	int ret = -ENOSPC;
	u64 blkoff;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON(min_bits != 1);
	BUG_ON(ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
							group_bh, bits_wanted,
							le16_to_cpu(bg->bg_bits),
							res);
		if (!ret && max_block) {
			blkoff = le64_to_cpu(bg->bg_blkno) +
				res->sr_bit_offset + res->sr_bits;
			trace_ocfs2_block_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				ret = -ENOSPC;
		}
	}

	return ret;
}

/*
 * Charge num_bits to the allocator dinode: bump bitmap1.i_used and
 * decrement the chain's free count, under the journal.
 */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				     handle_t *handle,
				     struct buffer_head *di_bh,
				     u32 num_bits,
				     u16 chain)
{
	int ret;
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

out:
	return ret;
}

/*
 * Undo ocfs2_alloc_dinode_update_counts() after a failed bitmap update.
 * In-memory only; the caller already holds journal access to di_bh.
 */
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
					struct buffer_head *di_bh,
					u32 num_bits,
					u16 chain)
{
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl;

	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}

/*
 * If res->sr_bit_offset falls inside this discontiguous-group extent
 * record, translate it to a physical block in sr_blkno, clamp sr_bits
 * to the extent boundary, and return 1; otherwise return 0.
 */
static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
					 struct ocfs2_extent_rec *rec,
					 struct ocfs2_chain_list *cl)
{
	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;

	if (res->sr_bit_offset < bitoff)
		return 0;
	if (res->sr_bit_offset >= (bitoff + bitcount))
		return 0;
	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
		(res->sr_bit_offset - bitoff);
	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
	return 1;
}

/*
 * Convert a bit-offset search result into a physical block number,
 * accounting for discontiguous block groups: sr_bg_blkno is cleared for
 * contiguous groups and restored when an extent record matched.
 */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}

/*
 * Try allocating directly from the hinted group in res->sr_bg_blkno
 * (short-circuiting the chain walk). Sets *released (and returns 0)
 * when the hinted group has been released and the caller should fall
 * back to a full chain search.
 */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left, int *released)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_hint_group_descriptor(alloc_inode, di,
					       res->sr_bg_blkno, &group_bh, released);
	if (*released) {
		return 0;
	} else if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
						   res->sr_bits,
						   le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}

/*
 * Walk one chain of the allocator looking for a group that can satisfy
 * the request, claim the bits there, and optionally relink the winning
 * group to the chain head.
 */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		/* In discontig mode on the global bitmap we may shrink
		 * bits_wanted to the largest contiguous run this group
		 * can offer (never below min_bits). */
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
					     bits_wanted, min_bits,
					     ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		/* Keep the previous descriptor pinned for a possible
		 * relink of the winning group below. */
		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
						   ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
		(unsigned long long)le64_to_cpu(fe->i_blkno),
		res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* will give out up to bits_wanted contiguous bits. */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	int released = 0;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	/* the hint bg may already be released; if so we quit searching
	 * this group and fall back to the chain search. */
	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left,
						&released);
		if (released) {
			res->sr_bg_blkno = 0;
			goto chain_search;
		}
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}
chain_search:
	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Claim up to bits_wanted metadata blocks from the reserved extent
 * allocator; reports the group, starting bit, starting block and the
 * number of bits actually claimed.
 */
int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   bits_wanted,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	*suballoc_loc = res.sr_bg_blkno;
	*suballoc_bit_start = res.sr_bit_offset;
	*blkno_start = res.sr_blkno;
	ac->ac_bits_given += res.sr_bits;
	*num_bits = res.sr_bits;
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}

/*
 * after ocfs2 has the ability to release block group unused space,
 * the ->ip_last_used_group may be invalid. so this function returns
 * ac->ac_last_group need to verify.
 * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details.
 */
static void ocfs2_init_inode_ac_group(struct inode *dir,
				      struct buffer_head *parent_di_bh,
				      struct ocfs2_alloc_context *ac)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
	/*
	 * Try to allocate inodes from some specific group.
	 *
	 * If the parent dir has recorded the last group used in allocation,
	 * cool, use it. Otherwise if we try to allocate new inode from the
	 * same slot the parent dir belongs to, use the same chunk.
	 *
	 * We are very careful here to avoid the mistake of setting
	 * ac_last_group to a group descriptor from a different (unlocked) slot.
	 */
	if (OCFS2_I(dir)->ip_last_used_group &&
	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
		/* i_suballoc_loc, when set, overrides the computed group. */
		if (di->i_suballoc_loc)
			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
		else
			ac->ac_last_group = ocfs2_which_suballoc_group(
					le64_to_cpu(di->i_blkno),
					le16_to_cpu(di->i_suballoc_bit));
	}
}

/* Remember the group we allocated from so the next inode allocation
 * under this directory can start its search there. */
static inline void ocfs2_save_inode_ac_group(struct inode *dir,
					     struct ocfs2_alloc_context *ac)
{
	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}

/*
 * Find (but do not yet claim) the location for a new inode.  The
 * search result is stashed in ac->ac_find_loc_priv (freed only on
 * failure here) and the chosen block is returned via *fe_blkno;
 * ocfs2_claim_new_inode_at_loc() performs the actual allocation later.
 */
int ocfs2_find_new_inode_loc(struct inode *dir,
			     struct buffer_head *parent_fe_bh,
			     struct ocfs2_alloc_context *ac,
			     u64 *fe_blkno)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_suballoc_result *res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	res = kzalloc(sizeof(*res), GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* Ownership of res transfers to the alloc context on success. */
	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	if (ret)
		kfree(res);

	return ret;
}

/*
 * Complete an inode allocation previously located by
 * ocfs2_find_new_inode_loc(): update the allocator dinode counts and
 * set the bit in the group bitmap.  @di_blkno must be the block number
 * the earlier search returned.
 */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		/* Undo the dinode count update made just above. */
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
					       ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}

/*
 * Claim one inode bit from the allocator attached to @ac.  On success
 * *suballoc_loc/*suballoc_bit describe where in the allocator the bit
 * lives and *fe_blkno is the disk block of the new inode.
 */
int ocfs2_claim_new_inode(handle_t *handle,
			  struct inode *dir,
			  struct buffer_head *parent_fe_bh,
			  struct ocfs2_alloc_context *ac,
			  u64 *suballoc_loc,
			  u16 *suballoc_bit,
			  u64 *fe_blkno)
{
	int status;
	struct ocfs2_suballoc_result res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   1,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	BUG_ON(res.sr_bits != 1);

	*suballoc_loc = res.sr_bg_blkno;
	*suballoc_bit = res.sr_bit_offset;
	*fe_blkno = res.sr_blkno;
	ac->ac_bits_given++;
	/* Remember the group for the next allocation under this dir. */
	ocfs2_save_inode_ac_group(dir, ac);
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* translate a group desc. blkno and its bitmap offset into
 * disk cluster offset. */
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 cluster = 0;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* The first cluster group contributes no base offset. */
	if (bg_blkno != osb->first_cluster_group_blkno)
		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
	cluster += (u32) bg_bit_off;
	return cluster;
}

/* given a cluster offset, calculate which block group it belongs to
 * and return that block offset. */
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 group_no;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* bitmap_cpg clusters per group; group 0 is special-cased. */
	group_no = cluster / osb->bitmap_cpg;
	if (!group_no)
		return osb->first_cluster_group_blkno;
	return ocfs2_clusters_to_blocks(inode->i_sb,
					group_no * osb->bitmap_cpg);
}

/* given the block number of a cluster start, calculate which cluster
 * group and descriptor bitmap offset that corresponds to.
 */
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	*bg_blkno = ocfs2_which_cluster_group(inode,
					      data_cluster);

	/* Offset within the group: cluster number directly for group 0,
	 * otherwise the distance from the group's start block. */
	if (*bg_blkno == osb->first_cluster_group_blkno)
		*bg_bit_off = (u16) data_cluster;
	else
		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
							     data_blkno - *bg_blkno);
}

/*
 * min_bits - minimum contiguous chunk from this total allocation we
 * can handle. set to what we asked for originally for a full
 * contig. allocation, set to '1' to indicate we can deal with extents
 * of any size.
 */
int __ocfs2_claim_clusters(handle_t *handle,
			   struct ocfs2_alloc_context *ac,
			   u32 min_clusters,
			   u32 max_clusters,
			   u32 *cluster_start,
			   u32 *num_clusters)
{
	int status;
	unsigned int bits_wanted = max_clusters;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);

	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
	       && ac->ac_which != OCFS2_AC_USE_MAIN
	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);

	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
		WARN_ON(min_clusters > 1);

		status = ocfs2_claim_local_alloc_bits(osb,
						      handle,
						      ac,
						      bits_wanted,
						      cluster_start,
						      num_clusters);
		if (!status)
			atomic_inc(&osb->alloc_stats.local_data);
	} else {
		if (min_clusters > (osb->bitmap_cpg - 1)) {
			/* The only paths asking for contiguousness
			 * should know about this already.
			 */
			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
			     "group bitmap size %u!\n", min_clusters,
			     osb->bitmap_cpg);
			status = -ENOSPC;
			goto bail;
		}
		/* clamp the current request down to a realistic size. */
		if (bits_wanted > (osb->bitmap_cpg - 1))
			bits_wanted = osb->bitmap_cpg - 1;

		status = ocfs2_claim_suballoc_bits(ac,
						   handle,
						   bits_wanted,
						   min_clusters,
						   &res);
		if (!status) {
			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
			*cluster_start =
				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
								 res.sr_bg_blkno,
								 res.sr_bit_offset);
			atomic_inc(&osb->alloc_stats.bitmap_data);
			*num_clusters = res.sr_bits;
		}
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	ac->ac_bits_given += *num_clusters;

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Claim all clusters still reserved on @ac (bits wanted minus bits
 * already given), honouring the caller's contiguity requirement. */
int ocfs2_claim_clusters(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 min_clusters,
			 u32 *cluster_start,
			 u32 *num_clusters)
{
	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;

	return __ocfs2_claim_clusters(handle, ac, min_clusters,
				      bits_wanted, cluster_start, num_clusters);
}

/*
 * Clear @num_bits bits starting at @bit_off in @bg's bitmap and bump
 * bg_free_bits_count.  For cluster groups the cached
 * bg_contig_free_bits hint is recomputed.  When @undo_fn is supplied
 * it is additionally applied to the journal's committed-data copy of
 * the descriptor under b_state_lock.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor(). Any corruption is a code bug.
 */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	/* Undo logging is only used against the global cluster bitmap. */
	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_state_lock protects the committed_data copy. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
					le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}

/*
 * Reclaim the suballocator managed space to main bitmap.
2639 * This function first works on the suballocator to perform the 2640 * cleanup rec/alloc_inode job, then switches to the main bitmap 2641 * to reclaim released space. 2642 * 2643 * handle: The transaction handle 2644 * alloc_inode: The suballoc inode 2645 * alloc_bh: The buffer_head of suballoc inode 2646 * group_bh: The group descriptor buffer_head of suballocator managed. 2647 * Caller should release the input group_bh. 2648 */ 2649 static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle, 2650 struct inode *alloc_inode, 2651 struct buffer_head *alloc_bh, 2652 struct buffer_head *group_bh) 2653 { 2654 int idx, status = 0; 2655 int i, next_free_rec, len = 0; 2656 __le16 old_bg_contig_free_bits = 0; 2657 u16 start_bit; 2658 u32 tmp_used; 2659 u64 bg_blkno, start_blk; 2660 unsigned int count; 2661 struct ocfs2_chain_rec *rec; 2662 struct buffer_head *main_bm_bh = NULL; 2663 struct inode *main_bm_inode = NULL; 2664 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 2665 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2666 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2667 struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data; 2668 2669 idx = le16_to_cpu(group->bg_chain); 2670 rec = &(cl->cl_recs[idx]); 2671 2672 status = ocfs2_extend_trans(handle, 2673 ocfs2_calc_group_alloc_credits(osb->sb, 2674 le16_to_cpu(cl->cl_cpg))); 2675 if (status) { 2676 mlog_errno(status); 2677 goto bail; 2678 } 2679 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2680 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2681 if (status < 0) { 2682 mlog_errno(status); 2683 goto bail; 2684 } 2685 2686 /* 2687 * Only clear the suballocator rec item in-place. 2688 * 2689 * If idx is not the last, we don't compress (remove the empty item) 2690 * the cl_recs[]. If not, we need to do lots jobs. 
2691 * 2692 * Compress cl_recs[] code example: 2693 * if (idx != cl->cl_next_free_rec - 1) 2694 * memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1], 2695 * sizeof(struct ocfs2_chain_rec) * 2696 * (cl->cl_next_free_rec - idx - 1)); 2697 * for(i = idx; i < cl->cl_next_free_rec-1; i++) { 2698 * group->bg_chain = "later group->bg_chain"; 2699 * group->bg_blkno = xxx; 2700 * ... ... 2701 * } 2702 */ 2703 2704 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total); 2705 fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total)); 2706 2707 /* Substraction 1 for the block group itself */ 2708 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2709 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1); 2710 2711 tmp_used = le32_to_cpu(fe->i_clusters); 2712 fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg)); 2713 2714 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 2715 OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters); 2716 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 2717 le32_to_cpu(fe->i_clusters))); 2718 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 2719 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 2720 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 2721 2722 ocfs2_journal_dirty(handle, alloc_bh); 2723 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); 2724 2725 start_blk = le64_to_cpu(rec->c_blkno); 2726 count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc); 2727 2728 /* 2729 * If the rec is the last one, let's compress the chain list by 2730 * removing the empty cl_recs[] at the end. 
2731 */ 2732 next_free_rec = le16_to_cpu(cl->cl_next_free_rec); 2733 if (idx == (next_free_rec - 1)) { 2734 len++; /* the last item should be counted first */ 2735 for (i = (next_free_rec - 2); i > 0; i--) { 2736 if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total) 2737 len++; 2738 else 2739 break; 2740 } 2741 } 2742 le16_add_cpu(&cl->cl_next_free_rec, -len); 2743 2744 rec->c_free = 0; 2745 rec->c_total = 0; 2746 rec->c_blkno = 0; 2747 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh); 2748 memset(group, 0, sizeof(struct ocfs2_group_desc)); 2749 2750 /* prepare job for reclaim clusters */ 2751 main_bm_inode = ocfs2_get_system_file_inode(osb, 2752 GLOBAL_BITMAP_SYSTEM_INODE, 2753 OCFS2_INVALID_SLOT); 2754 if (!main_bm_inode) 2755 goto bail; /* ignore the error in reclaim path */ 2756 2757 inode_lock(main_bm_inode); 2758 2759 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); 2760 if (status < 0) 2761 goto free_bm_inode; /* ignore the error in reclaim path */ 2762 2763 ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno, 2764 &start_bit); 2765 fe = (struct ocfs2_dinode *) main_bm_bh->b_data; 2766 cl = &fe->id2.i_chain; 2767 /* reuse group_bh, caller will release the input group_bh */ 2768 group_bh = NULL; 2769 2770 /* reclaim clusters to global_bitmap */ 2771 status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno, 2772 &group_bh); 2773 if (status < 0) { 2774 mlog_errno(status); 2775 goto free_bm_bh; 2776 } 2777 group = (struct ocfs2_group_desc *) group_bh->b_data; 2778 2779 if ((count + start_bit) > le16_to_cpu(group->bg_bits)) { 2780 ocfs2_error(alloc_inode->i_sb, 2781 "reclaim length (%d) beyands block group length (%d)", 2782 count + start_bit, le16_to_cpu(group->bg_bits)); 2783 goto free_group_bh; 2784 } 2785 2786 old_bg_contig_free_bits = group->bg_contig_free_bits; 2787 status = ocfs2_block_group_clear_bits(handle, main_bm_inode, 2788 group, group_bh, 2789 start_bit, count, 0, 2790 _ocfs2_clear_bit); 2791 if (status < 0) 
{ 2792 mlog_errno(status); 2793 goto free_group_bh; 2794 } 2795 2796 status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 2797 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2798 if (status < 0) { 2799 mlog_errno(status); 2800 ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh, 2801 start_bit, count, 2802 le16_to_cpu(old_bg_contig_free_bits), 1); 2803 goto free_group_bh; 2804 } 2805 2806 idx = le16_to_cpu(group->bg_chain); 2807 rec = &(cl->cl_recs[idx]); 2808 2809 le32_add_cpu(&rec->c_free, count); 2810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2811 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2812 ocfs2_journal_dirty(handle, main_bm_bh); 2813 2814 free_group_bh: 2815 brelse(group_bh); 2816 2817 free_bm_bh: 2818 ocfs2_inode_unlock(main_bm_inode, 1); 2819 brelse(main_bm_bh); 2820 2821 free_bm_inode: 2822 inode_unlock(main_bm_inode); 2823 iput(main_bm_inode); 2824 2825 bail: 2826 return status; 2827 } 2828 2829 /* 2830 * expects the suballoc inode to already be locked. 2831 */ 2832 static int _ocfs2_free_suballoc_bits(handle_t *handle, 2833 struct inode *alloc_inode, 2834 struct buffer_head *alloc_bh, 2835 unsigned int start_bit, 2836 u64 bg_blkno, 2837 unsigned int count, 2838 void (*undo_fn)(unsigned int bit, 2839 unsigned long *bitmap)) 2840 { 2841 int idx, status = 0; 2842 u32 tmp_used; 2843 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2844 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2845 struct buffer_head *group_bh = NULL; 2846 struct ocfs2_group_desc *group; 2847 struct ocfs2_chain_rec *rec; 2848 __le16 old_bg_contig_free_bits = 0; 2849 2850 /* The alloc_bh comes from ocfs2_free_dinode() or 2851 * ocfs2_free_clusters(). The callers have all locked the 2852 * allocator and gotten alloc_bh from the lock call. This 2853 * validates the dinode buffer. Any corruption that has happened 2854 * is a code bug. 
 */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));

	trace_ocfs2_free_suballoc_bits(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		(unsigned long long)bg_blkno,
		start_bit, count);

	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));

	/* Save the contig hint so a failed dinode update can restore it. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		old_bg_contig_free_bits = group->bg_contig_free_bits;
	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
					      group, group_bh,
					      start_bit, count, 0, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		/* Roll the group bitmap back so it stays consistent with
		 * the untouched dinode counts. */
		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
					   start_bit, count,
					   le16_to_cpu(old_bg_contig_free_bits), 1);
		goto bail;
	}

	idx = le16_to_cpu(group->bg_chain);
	rec = &(cl->cl_recs[idx]);

	le32_add_cpu(&rec->c_free, count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
	ocfs2_journal_dirty(handle, alloc_bh);

	/*
	 * Reclaim suballocator free space.
	 * Bypass: global_bitmap, non empty rec, first rec in cl_recs[]
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode) ||
	    (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) ||
	    (le16_to_cpu(cl->cl_next_free_rec) == 1)) {
		goto bail;
	}

	/* Best effort: errors in the reclaim path are deliberately ignored. */
	_ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh);

bail:
	brelse(group_bh);
	return status;
}

/* Free @count bits of group @bg_blkno in @alloc_inode, no undo logging. */
int ocfs2_free_suballoc_bits(handle_t *handle,
			     struct inode *alloc_inode,
			     struct buffer_head *alloc_bh,
			     unsigned int start_bit,
			     u64 bg_blkno,
			     unsigned int count)
{
	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
					 start_bit, bg_blkno, count, NULL);
}

/* Release the allocator bit backing inode @di. */
int ocfs2_free_dinode(handle_t *handle,
		      struct inode *inode_alloc_inode,
		      struct buffer_head *inode_alloc_bh,
		      struct ocfs2_dinode *di)
{
	u64 blk = le64_to_cpu(di->i_blkno);
	u16 bit = le16_to_cpu(di->i_suballoc_bit);
	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);

	/* i_suballoc_loc, when set, overrides the computed group block. */
	if (di->i_suballoc_loc)
		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
					inode_alloc_bh, bit, bg_blkno, 1);
}

static int _ocfs2_free_clusters(handle_t *handle,
				struct inode *bitmap_inode,
				struct buffer_head *bitmap_bh,
				u64 start_blk,
				unsigned int num_clusters,
				void (*undo_fn)(unsigned int bit,
						unsigned long *bitmap))
{
	int status;
	u16 bg_start_bit;
	u64 bg_blkno;

	/* You can't ever have a contiguous set of clusters
	 * bigger than a block group bitmap so we never have to worry
	 * about looping on them.
	 * This is expensive. We can safely remove once this stuff has
	 * gotten tested really well.
 */
	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
							 start_blk)));


	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
				     &bg_start_bit);

	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
				  (unsigned long long)start_blk,
				  bg_start_bit, num_clusters);

	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
					   bg_start_bit, bg_blkno,
					   num_clusters, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* Tell the local allocator that free bits appeared in the bitmap. */
	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
					 num_clusters);

out:
	return status;
}

/* Free clusters, recording an undo image (_ocfs2_set_bit) with the
 * journal's committed-data copy. */
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_set_bit);
}

/*
 * Give never-used clusters back to the global bitmap. We don't need
 * to protect these bits in the undo buffer.
 */
int ocfs2_release_clusters(handle_t *handle,
			   struct inode *bitmap_inode,
			   struct buffer_head *bitmap_bh,
			   u64 start_blk,
			   unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_clear_bit);
}

/*
 * For a given allocation, determine which allocators will need to be
 * accessed, and lock them, reserving the appropriate number of bits.
 *
 * Sparse file systems call this from ocfs2_write_begin_nolock()
 * and ocfs2_allocate_unwritten_extents().
 *
 * File systems which don't support holes call this from
 * ocfs2_extend_allocation().
 */
int ocfs2_lock_allocators(struct inode *inode,
			  struct ocfs2_extent_tree *et,
			  u32 clusters_to_add, u32 extents_to_split,
			  struct ocfs2_alloc_context **data_ac,
			  struct ocfs2_alloc_context **meta_ac)
{
	int ret = 0, num_free_extents;
	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	*meta_ac = NULL;
	if (data_ac)
		*data_ac = NULL;

	BUG_ON(clusters_to_add != 0 && data_ac == NULL);

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Sparse allocation file systems need to be more conservative
	 * with reserving room for expansion - the actual allocation
	 * happens while we've got a journal handle open so re-taking
	 * a cluster lock (because we ran out of room for another
	 * extent) will violate ordering rules.
	 *
	 * Most of the time we'll only be seeing this 1 cluster at a time
	 * anyway.
	 *
	 * Always lock for any unwritten extents - we might want to
	 * add blocks during a split.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

	if (clusters_to_add == 0)
		goto out;

	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

out:
	if (ret) {
		/* On failure, hand nothing back to the caller. */
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}

		/*
		 * We cannot have an error and a non null *data_ac.
		 */
	}

	return ret;
}

/*
 * Read the inode specified by blkno to get suballoc_slot and
 * suballoc_bit.
 */
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
				       u16 *suballoc_slot, u64 *group_blkno,
				       u16 *suballoc_bit)
{
	int status;
	struct buffer_head *inode_bh = NULL;
	struct ocfs2_dinode *inode_fe;

	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);

	/* dirty read disk */
	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
	if (status < 0) {
		mlog(ML_ERROR, "read block %llu failed %d\n",
		     (unsigned long long)blkno, status);
		goto bail;
	}

	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
		mlog(ML_ERROR, "invalid inode %llu requested\n",
		     (unsigned long long)blkno);
		status = -EINVAL;
		goto bail;
	}

	/* A slot must be either OCFS2_INVALID_SLOT or within max_slots. */
	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
		     (unsigned long long)blkno,
		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
		status = -EINVAL;
		goto bail;
	}

	/* All three out-parameters are optional. */
	if (suballoc_slot)
		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
	if (suballoc_bit)
		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
	if (group_blkno)
		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);

bail:
	brelse(inode_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * test whether bit is SET in allocator bitmap or not. on success, 0
 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
 * is returned and *res is meaningless. Call this after you have
 * cluster locked against suballoc, or you may get a result based on
 * non-up2date contents
 */
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
				   struct inode *suballoc,
				   struct buffer_head *alloc_bh,
				   u64 group_blkno, u64 blkno,
				   u16 bit, int *res)
{
	struct ocfs2_dinode *alloc_di;
	struct ocfs2_group_desc *group;
	struct buffer_head *group_bh = NULL;
	u64 bg_blkno;
	int status, quiet = 0, released = 0;

	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
				      (unsigned int)bit);

	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
		     (unsigned int)bit,
		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
		status = -EINVAL;
		goto bail;
	}

	bg_blkno = group_blkno ? group_blkno :
		   ocfs2_which_suballoc_group(blkno, bit);
	status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno,
						  &group_bh, &released);
	if (released) {
		/* Group was released out from under us; report -ESTALE
		 * quietly (callers treat it as an expected outcome). */
		quiet = 1;
		status = -ESTALE;
		goto bail;
	} else if (status < 0) {
		mlog(ML_ERROR, "read group %llu failed %d\n",
		     (unsigned long long)bg_blkno, status);
		goto bail;
	}

	group = (struct ocfs2_group_desc *) group_bh->b_data;
	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);

bail:
	brelse(group_bh);

	if (status && !quiet)
		mlog_errno(status);
	return status;
}

/*
 * Test if the bit representing this inode (blkno) is set in the
 * suballocator.
 *
 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
 *
 * In the event of failure, a negative value is returned and *res is
 * meaningless.
 *
 * Callers must make sure to hold nfs_sync_lock to prevent
 * ocfs2_delete_inode() on another node from accessing the same
 * suballocator concurrently.
 */
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
	int status, quiet = 0;
	u64 group_blkno = 0;
	u16 suballoc_bit = 0, suballoc_slot = 0;
	struct inode *inode_alloc_inode;
	struct buffer_head *alloc_bh = NULL;

	trace_ocfs2_test_inode_bit((unsigned long long)blkno);

	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
					     &group_blkno, &suballoc_bit);
	if (status < 0) {
		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
		goto bail;
	}

	/* OCFS2_INVALID_SLOT means the inode lives in the global
	 * inode allocator rather than a per-slot one. */
	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	else
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	if (!inode_alloc_inode) {
		/* the error code could be inaccurate, but we are not able to
		 * get the correct one. */
		status = -EINVAL;
		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
		     (u32)suballoc_slot);
		goto bail;
	}

	inode_lock(inode_alloc_inode);
	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
	if (status < 0) {
		inode_unlock(inode_alloc_inode);
		iput(inode_alloc_inode);
		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
		     (u32)suballoc_slot, status);
		goto bail;
	}

	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
					 group_blkno, blkno, suballoc_bit, res);
	if (status < 0) {
		/* -ESTALE (released group) is an expected outcome; be quiet. */
		if (status == -ESTALE)
			quiet = 1;
		else
			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
	}

	ocfs2_inode_unlock(inode_alloc_inode, 0);
	inode_unlock(inode_alloc_inode);

	iput(inode_alloc_inode);
	brelse(alloc_bh);
bail:
	if (status && !quiet)
		mlog_errno(status);
	return status;
}