// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * suballoc.c
 *
 * metadata alloc and free
 * Inspired by ext3 block groups.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/highmem.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

/*
 * Flags for ocfs2_reserve_suballoc_bits(): whether the allocator may be
 * grown by adding a new block group, and whether new groups may be
 * claimed from the global bitmap.
 */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/*
 * Upper bound on consecutive allocations stolen from other slots before
 * we re-check our own slot's allocator for space.
 */
#define OCFS2_MAX_TO_STEAL		1024

/* Result of a suballocator search, filled in by the group search hooks. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};

/*
 * Return the block number of the group descriptor 'res' was allocated
 * from, or 0 if nothing was allocated.  When sr_bg_blkno is zero the
 * group was contiguous and the owning group is derived from the
 * allocated block itself.
 */
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
{
	if (res->sr_blkno == 0)
		return 0;

	if (res->sr_bg_blkno)
		return res->sr_bg_blkno;

	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}

static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);

/*
 * Drop everything an allocation context holds: the allocator inode
 * (with its cluster lock and VFS inode lock), the allocator dinode
 * buffer, the reservation pointer and the cached find-location state.
 * The context struct itself is not freed here; see
 * ocfs2_free_alloc_context().
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		/* The local alloc path does not take the cluster lock. */
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}

/* Release an allocation context's resources and free the context. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}

/* Total bitmap bits per block group: clusters/group * bits/cluster. */
static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
{
	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}

/*
 * In the resize path ('resize' != 0) a bad descriptor is only logged;
 * otherwise it fails the filesystem via ocfs2_error() and returns from
 * the calling function.  Relies on 'sb' and 'resize' being in scope at
 * the expansion site.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)

/*
 * Validate the self-contained fields of a group descriptor: signature,
 * block number, generation, bit counts and, for discontiguous groups,
 * the embedded extent list geometry.  Returns 0 when valid (always 0
 * in the resize case, where problems are only logged).
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	/*
	 * For discontiguous block groups, validate the on-disk extent list
	 * against the maximum number of extent records that can physically
	 * fit in a single block.
	 */
	if (ocfs2_gd_is_discontig(gd)) {
		u16 max_recs = ocfs2_extent_recs_per_gd(sb);
		u16 l_count = le16_to_cpu(gd->bg_list.l_count);
		u16 l_next_free_rec = le16_to_cpu(gd->bg_list.l_next_free_rec);

		if (l_count != max_recs) {
			do_error("Group descriptor #%llu bad discontig l_count %u expected %u\n",
				 (unsigned long long)bh->b_blocknr,
				 l_count,
				 max_recs);
		}

		if (l_next_free_rec > l_count) {
			do_error("Group descriptor #%llu bad discontig l_next_free_rec %u max %u\n",
				 (unsigned long long)bh->b_blocknr,
				 l_next_free_rec,
				 l_count);
		}
	}

	return 0;
}

/*
 * Validate the fields that tie a group descriptor to its owning chain
 * allocator 'di': parent dinode pointer, bit count versus the
 * allocator's group geometry, and the chain index.
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	/* Both fields are little-endian on disk, so compare raw. */
	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}

#undef do_error

/*
 * This version only prints errors.
It does not fail the filesystem, and 267 * exists only for resize. 268 */ 269 int ocfs2_check_group_descriptor(struct super_block *sb, 270 struct ocfs2_dinode *di, 271 struct buffer_head *bh) 272 { 273 int rc; 274 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 275 276 BUG_ON(!buffer_uptodate(bh)); 277 278 /* 279 * If the ecc fails, we return the error but otherwise 280 * leave the filesystem running. We know any error is 281 * local to this block. 282 */ 283 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); 284 if (rc) { 285 mlog(ML_ERROR, 286 "Checksum failed for group descriptor %llu\n", 287 (unsigned long long)bh->b_blocknr); 288 } else 289 rc = ocfs2_validate_gd_self(sb, bh, 1); 290 if (!rc) 291 rc = ocfs2_validate_gd_parent(sb, di, bh, 1); 292 293 return rc; 294 } 295 296 static int ocfs2_validate_group_descriptor(struct super_block *sb, 297 struct buffer_head *bh) 298 { 299 int rc; 300 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 301 302 trace_ocfs2_validate_group_descriptor( 303 (unsigned long long)bh->b_blocknr); 304 305 BUG_ON(!buffer_uptodate(bh)); 306 307 /* 308 * If the ecc fails, we return the error but otherwise 309 * leave the filesystem running. We know any error is 310 * local to this block. 311 */ 312 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); 313 if (rc) 314 return rc; 315 316 /* 317 * Errors after here are fatal. 318 */ 319 320 return ocfs2_validate_gd_self(sb, bh, 0); 321 } 322 323 /* 324 * The hint group descriptor (gd) may already have been released 325 * in _ocfs2_free_suballoc_bits(). We first check the gd signature, 326 * then perform the standard ocfs2_read_group_descriptor() jobs. 327 * 328 * If the gd signature is invalid, we return 'rc=0' and set 329 * '*released=1'. The caller is expected to handle this specific case. 330 * Otherwise, we return the actual error code. 331 * 332 * We treat gd signature corruption case as a release case. 
The
 * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain()
 * to search each gd block. The code will eventually find this
 * corrupted gd block - Late, but not missed.
 *
 * Note:
 * The caller is responsible for initializing the '*released' status.
 */
static int ocfs2_read_hint_group_descriptor(struct inode *inode,
		struct ocfs2_dinode *di, u64 gd_blkno,
		struct buffer_head **bh, int *released)
{
	int rc;
	struct buffer_head *tmp = *bh;
	struct ocfs2_group_desc *gd;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL);
	if (rc)
		goto out;

	gd = (struct ocfs2_group_desc *) tmp->b_data;
	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		/*
		 * Invalid gd cache was set in ocfs2_read_block(),
		 * which will affect block_group allocation.
		 * Path:
		 * ocfs2_reserve_suballoc_bits
		 *  ocfs2_block_group_alloc
		 *   ocfs2_block_group_alloc_contig
		 *    ocfs2_set_new_buffer_uptodate
		 */
		ocfs2_remove_from_cache(INODE_CACHE(inode), tmp);
		*released = 1; /* we return 'rc=0' for this case */
		goto free_bh;
	}

	/* below jobs same with ocfs2_read_group_descriptor() */
	if (!buffer_jbd(tmp)) {
		rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp);
		if (rc)
			goto free_bh;
	}

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc)
		goto free_bh;

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

	return rc;

free_bh:
	brelse(tmp);
out:
	return rc;
}

/*
 * Read and fully validate the group descriptor at 'gd_blkno' belonging
 * to chain allocator 'di'.  On success, *bh holds a referenced buffer
 * (a caller-supplied *bh is reused; a NULL *bh is filled in).
 */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}

/*
 * Append a newly claimed cluster extent to a discontiguous block
 * group's embedded extent list, and grow the group's bit counts to
 * cover the new clusters.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent added: set the list's capacity. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* cpos of the new extent = bits covered so far / bits per cluster */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}

/*
 * Initialize a freshly allocated group descriptor block at
 * 'group_blkno' under the current transaction: zero it, fill in the
 * static fields, link it at the head of chain 'my_chain', and reserve
 * bit 0 for the descriptor block itself.  For a short (discontiguous)
 * group, the first extent is recorded instead of the full bit count.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Pick the chain with the smallest total bit count. */
static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_count)) {
		if (le32_to_cpu(cl->cl_recs[best].c_total) >
		    le32_to_cpu(cl->cl_recs[curr].c_total))
			best = curr;
		curr++;
	}
	return best;
}

/*
 * Allocate a fully contiguous block group (cl_cpg clusters) from the
 * global bitmap and initialize its descriptor.  Returns the descriptor
 * buffer on success or an ERR_PTR (-ENOSPC if no contiguous run of
 * that size was available - the caller may then fall back to a
 * discontiguous group).
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
		(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * Claim up to 'min_bits' clusters, halving the request on each ENOSPC
 * until something is found or the request shrinks to zero.
 */
static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	while (min_bits) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;

		min_bits >>= 1;
	}

	return status;
}

/*
 * Keep claiming cluster extents for a discontiguous block group until
 * it reaches cl_cpg clusters or the embedded extent list is full.
 * Fails with -ENOSPC if the extent records run out before the group is
 * complete.
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}

/*
 * Undo a failed discontiguous group allocation: give back every
 * cluster extent recorded in the partially built descriptor, then drop
 * the descriptor buffer from cache and release it.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}

/*
 * Build a block group out of several smaller cluster extents when a
 * contiguous allocation was not possible.  Returns the descriptor
 * buffer or an ERR_PTR; on failure all claimed clusters are given
 * back via ocfs2_bg_alloc_cleanup().
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	/* Start by asking for half a group; shrink from there. */
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}

/*
 * We expect the block group allocator to already be locked.
 */
/*
 * Grow a chain allocator by one block group: claim cl_cpg clusters
 * from the global bitmap (contiguously if possible, discontiguously
 * otherwise), initialize the new descriptor, and splice it into the
 * allocator's chain records within a single transaction.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* The cluster bitmap never grows this way. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Seed the cluster search with the caller's cached group hint. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	/* No contiguous run of cl_cpg clusters - try a discontig group. */
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Account the new group in its chain record and the dinode. */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/*
 * Take and lock the allocator inode for (type, slot), and make sure it
 * has at least ac->ac_bits_wanted free bits - growing it with a new
 * block group if 'flags' allow.  On success the context owns the
 * allocator inode (locked) and a reference on its dinode buffer.
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	/* From here the locks and the iput are dropped by
	 * ocfs2_free_ac_resource(). */
	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock(). Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* Forget any remembered inode steal victim and reset the steal count. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}

/* Forget any remembered metadata steal victim and reset the steal count. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}

/* Reset both inode and metadata steal state. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}

/* Record 'slot' as the current steal victim for allocator 'type'. */
static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		osb->s_inode_steal_slot = (u16)slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		osb->s_meta_steal_slot = (u16)slot;
	spin_unlock(&osb->osb_lock);
}

/* Fetch the remembered steal victim for allocator 'type'. */
static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
{
	int slot = OCFS2_INVALID_SLOT;

	spin_lock(&osb->osb_lock);
	if (type == INODE_ALLOC_SYSTEM_INODE)
		slot = osb->s_inode_steal_slot;
	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
		slot = osb->s_meta_steal_slot;
	spin_unlock(&osb->osb_lock);

	return slot;
}

static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Walk every other slot's allocator (without growing them) until one
 * has room; remember the winner so the next steal starts there.
 * Returns -ENOSPC if no other slot can satisfy the request.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		ocfs2_free_ac_resource(ac);
	}

	return status;
}

static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}

static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}

/*
 * Build an allocation context good for 'blocks' metadata blocks,
 * preferring our own slot's extent allocator and falling back to
 * stealing from other slots on ENOSPC.  On success *ac is set; on
 * failure it is freed and NULLed.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc_obj(struct ocfs2_alloc_context);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* Keep stealing until the cap, then retry our own slot. */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		/* Our own slot worked again - stop stealing. */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* Reserve enough metadata blocks for an extent tree growth on root_el. */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}

/*
 * Build an allocation context good for one new inode, preferring our
 * own slot's inode allocator and falling back to stealing from other
 * slots.  On success *ac is set; on failure it is freed and NULLed.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc_obj(struct ocfs2_alloc_context);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit. The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Cache the group that served us for the next allocation. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}

/* local alloc code has to do the same thing, so rather than do this
 * twice..
*/ 1220 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 1221 struct ocfs2_alloc_context *ac) 1222 { 1223 int status; 1224 1225 ac->ac_which = OCFS2_AC_USE_MAIN; 1226 ac->ac_group_search = ocfs2_cluster_group_search; 1227 1228 status = ocfs2_reserve_suballoc_bits(osb, ac, 1229 GLOBAL_BITMAP_SYSTEM_INODE, 1230 OCFS2_INVALID_SLOT, NULL, 1231 ALLOC_NEW_GROUP); 1232 if (status < 0 && status != -ENOSPC) 1233 mlog_errno(status); 1234 1235 return status; 1236 } 1237 1238 /* Callers don't need to care which bitmap (local alloc or main) to 1239 * use so we figure it out for them, but unfortunately this clutters 1240 * things a bit. */ 1241 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 1242 u32 bits_wanted, u64 max_block, 1243 int flags, 1244 struct ocfs2_alloc_context **ac) 1245 { 1246 int status, ret = 0; 1247 int retried = 0; 1248 1249 *ac = kzalloc_obj(struct ocfs2_alloc_context); 1250 if (!(*ac)) { 1251 status = -ENOMEM; 1252 mlog_errno(status); 1253 goto bail; 1254 } 1255 1256 (*ac)->ac_bits_wanted = bits_wanted; 1257 (*ac)->ac_max_block = max_block; 1258 1259 status = -ENOSPC; 1260 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && 1261 ocfs2_alloc_should_use_local(osb, bits_wanted)) { 1262 status = ocfs2_reserve_local_alloc_bits(osb, 1263 bits_wanted, 1264 *ac); 1265 if ((status < 0) && (status != -ENOSPC)) { 1266 mlog_errno(status); 1267 goto bail; 1268 } 1269 } 1270 1271 if (status == -ENOSPC) { 1272 retry: 1273 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1274 /* Retry if there is sufficient space cached in truncate log */ 1275 if (status == -ENOSPC && !retried) { 1276 retried = 1; 1277 ocfs2_inode_unlock((*ac)->ac_inode, 1); 1278 inode_unlock((*ac)->ac_inode); 1279 1280 ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); 1281 if (ret == 1) { 1282 iput((*ac)->ac_inode); 1283 (*ac)->ac_inode = NULL; 1284 goto retry; 1285 } 1286 1287 if (ret < 0) 1288 mlog_errno(ret); 1289 1290 inode_lock((*ac)->ac_inode); 1291 ret = 
ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); 1292 if (ret < 0) { 1293 mlog_errno(ret); 1294 inode_unlock((*ac)->ac_inode); 1295 iput((*ac)->ac_inode); 1296 (*ac)->ac_inode = NULL; 1297 goto bail; 1298 } 1299 } 1300 if (status < 0) { 1301 if (status != -ENOSPC) 1302 mlog_errno(status); 1303 goto bail; 1304 } 1305 } 1306 1307 status = 0; 1308 bail: 1309 if ((status < 0) && *ac) { 1310 ocfs2_free_alloc_context(*ac); 1311 *ac = NULL; 1312 } 1313 1314 if (status) 1315 mlog_errno(status); 1316 return status; 1317 } 1318 1319 int ocfs2_reserve_clusters(struct ocfs2_super *osb, 1320 u32 bits_wanted, 1321 struct ocfs2_alloc_context **ac) 1322 { 1323 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, 1324 ALLOC_NEW_GROUP, ac); 1325 } 1326 1327 /* 1328 * More or less lifted from ext3. I'll leave their description below: 1329 * 1330 * "For ext3 allocations, we must not reuse any blocks which are 1331 * allocated in the bitmap buffer's "last committed data" copy. This 1332 * prevents deletes from freeing up the page for reuse until we have 1333 * committed the delete transaction. 1334 * 1335 * If we didn't do this, then deleting something and reallocating it as 1336 * data would allow the old block to be overwritten before the 1337 * transaction committed (because we force data to disk before commit). 1338 * This would lead to corruption if we crashed between overwriting the 1339 * data and committing the delete. 1340 * 1341 * @@@ We may want to make this allocation behaviour conditional on 1342 * data-writes at some point, and disable it for metadata allocations or 1343 * sync-data inodes." 1344 * 1345 * Note: OCFS2 already does this differently for metadata vs data 1346 * allocations, as those bitmaps are separate and undo access is never 1347 * called on a metadata group descriptor. 
 */

/* Returns non-zero iff bit 'nr' is free both in the current bitmap and
 * in the journal's last-committed copy (if one exists), i.e. it is safe
 * to hand out without risking reuse before the freeing transaction
 * commits. */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct journal_head *jh;
	int ret;

	/* Set in the live bitmap -- definitely not allocatable. */
	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	/* No journal head: nothing in flight, the live bitmap is truth. */
	jh = jbd2_journal_grab_journal_head(bg_bh);
	if (!jh)
		return 1;

	spin_lock(&jh->b_state_lock);
	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
	if (bg)
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	spin_unlock(&jh->b_state_lock);
	jbd2_journal_put_journal_head(jh);

	return ret;
}

/* Return the length of the longest run of zero (free) bits in 'bitmap',
 * scanning from 'start' up to 'total_bits'. */
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
			 u16 total_bits, u16 start)
{
	u16 offset, free_bits;
	u16 contig_bits = 0;

	while (start < total_bits) {
		offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
		if (offset == total_bits)
			break;

		/* Run of zeros is [offset, start). */
		start = ocfs2_find_next_bit(bitmap, total_bits, offset);
		free_bits = start - offset;
		if (contig_bits < free_bits)
			contig_bits = free_bits;
	}

	return contig_bits;
}

/* Find up to bits_wanted contiguous allocatable bits in the group
 * bitmap.  On success fills res->sr_bit_offset/sr_bits with the best
 * (longest, capped at bits_wanted) run found and sr_max_contig_bits
 * with the best run seen before it; returns -ENOSPC if no allocatable
 * bit exists. */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}

/* Mark num_bits bits starting at bit_off as used in the group bitmap,
 * journalling the change (with undo access for cluster bitmaps) and
 * maintaining bg_free_bits_count and bg_contig_free_bits. */
int ocfs2_block_group_set_bits(handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_group_desc *bg,
			       struct buffer_head *group_bh,
			       unsigned int bit_off,
			       unsigned int num_bits,
			       unsigned int max_contig_bits,
			       int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	/* Cluster bitmap bits need undo access so a crashed transaction
	 * can't leak freed clusters -- see the ext3 comment above. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* u16 underflow check: the subtraction above wrapped. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * This is the optimized path: the caller passed the old contig
	 * value in max_contig_bits so we can bypass the bitmap scan.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}

/* find the one with the most empty bits */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
{
	u16 curr, best;

	BUG_ON(!cl->cl_next_free_rec);

	best = curr = 0;
	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
		    le32_to_cpu(cl->cl_recs[best].c_free))
			best = curr;
		curr++;
	}

	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
	return best;
}

/* Move bg to the head of its chain: prev_bg skips over bg, bg points at
 * the old chain head, and the chain record points at bg.  All three
 * blocks are journalled; on a failed journal access the in-memory
 * next_group pointers are rolled back so nothing is left inconsistent. */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Saved for rollback on journal-access failure. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}

/* Note: strictly greater than -- a group with exactly 'wanted' free
 * bits is not considered "reasonably empty". */
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
}

/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error.
 */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Cheap reject: the cached longest free run is too short. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may be uninitialized, so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize. If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says. */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits. */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}

/* Search one suballocator block group for bits_wanted free bits.
 * Same return convention as ocfs2_cluster_group_search(). */
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res)
{
	int ret = -ENOSPC;
	u64 blkoff;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;

	/* Suballocators always allocate one bit at a time. */
	BUG_ON(min_bits != 1);
	BUG_ON(ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
							group_bh, bits_wanted,
							le16_to_cpu(bg->bg_bits),
							res);
		if (!ret && max_block) {
			blkoff = le64_to_cpu(bg->bg_blkno) +
				res->sr_bit_offset + res->sr_bits;
			trace_ocfs2_block_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				ret = -ENOSPC;
		}
	}

	return ret;
}

/* Account num_bits newly-used bits against the allocator dinode:
 * bump i_used and shrink the chain's free count, under journal. */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				     handle_t *handle,
				     struct buffer_head *di_bh,
				     u32 num_bits,
				     u16 chain)
{
	int ret;
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

out:
	return ret;
}

/* Undo ocfs2_alloc_dinode_update_counts() after a failed set_bits.
 * No journal access here: the caller already holds it for di_bh. */
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
					struct buffer_head *di_bh,
					u32 num_bits,
					u16 chain)
{
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl;

	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}

/* If the allocated bit range falls inside this discontiguous-group
 * extent record, translate sr_bit_offset into a physical block number
 * (clamping sr_bits to the record's end) and return 1; else return 0. */
static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
					 struct ocfs2_extent_rec *rec,
					 struct ocfs2_chain_list *cl)
{
	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;

	if (res->sr_bit_offset < bitoff)
		return 0;
	if (res->sr_bit_offset >= (bitoff + bitcount))
		return 0;
	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
		(res->sr_bit_offset - bitoff);
	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
	return 1;
}

/* Fix up res->sr_blkno/sr_bg_blkno after a group search: cluster
 * bitmaps carry no block number; contiguous groups use simple offset
 * arithmetic; discontiguous groups map through their extent list. */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno; /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}

/* Try the single group at res->sr_bg_blkno (the caller's hint).  Sets
 * *released and returns 0 if that group has been released and the
 * caller should fall back to a full chain search. */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left, int *released)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_hint_group_descriptor(alloc_inode, di,
			res->sr_bg_blkno, &group_bh, released);
	if (*released) {
		return 0;
	} else if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
						   res->sr_bits,
						   le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}

/* Walk chain ac->ac_chain looking for bits_wanted bits, claiming them
 * (unless ac_find_loc_only) and optionally relinking the winning group
 * to the head of its chain. */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			/* Discontig fallback: shrink the request to the
			 * longest run this group can actually supply. */
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
					     bits_wanted, min_bits,
					     ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
					ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
			(unsigned long long)le64_to_cpu(fe->i_blkno),
			res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}

/* will give out up to bits_wanted contiguous bits. */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	int released = 0;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	/* the hint bg may already be released; if so we quit searching
	 * this group and fall through to the chain search. */
	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left,
						&released);
		if (released) {
			res->sr_bg_blkno = 0;
			goto chain_search;
		}
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}
chain_search:
	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}

/* Claim up to bits_wanted metadata blocks from the reserved extent
 * allocator; returns the suballocator group, starting bit, count and
 * first physical block of the claimed run. */
int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(ac,
					   handle,
					   bits_wanted,
					   1,
					   &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	*suballoc_loc = res.sr_bg_blkno;
	*suballoc_bit_start = res.sr_bit_offset;
	*blkno_start = res.sr_blkno;
	ac->ac_bits_given += res.sr_bits;
	*num_bits = res.sr_bits;
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}

/*
 * after ocfs2 has the ability to release block group unused space,
 * the ->ip_last_used_group may be invalid. so the ac->ac_last_group
 * this function sets still needs to be verified by the caller.
 * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details.
2223 */ 2224 static void ocfs2_init_inode_ac_group(struct inode *dir, 2225 struct buffer_head *parent_di_bh, 2226 struct ocfs2_alloc_context *ac) 2227 { 2228 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; 2229 /* 2230 * Try to allocate inodes from some specific group. 2231 * 2232 * If the parent dir has recorded the last group used in allocation, 2233 * cool, use it. Otherwise if we try to allocate new inode from the 2234 * same slot the parent dir belongs to, use the same chunk. 2235 * 2236 * We are very careful here to avoid the mistake of setting 2237 * ac_last_group to a group descriptor from a different (unlocked) slot. 2238 */ 2239 if (OCFS2_I(dir)->ip_last_used_group && 2240 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2241 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2242 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { 2243 if (di->i_suballoc_loc) 2244 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); 2245 else 2246 ac->ac_last_group = ocfs2_which_suballoc_group( 2247 le64_to_cpu(di->i_blkno), 2248 le16_to_cpu(di->i_suballoc_bit)); 2249 } 2250 } 2251 2252 static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2253 struct ocfs2_alloc_context *ac) 2254 { 2255 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; 2256 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2257 } 2258 2259 int ocfs2_find_new_inode_loc(struct inode *dir, 2260 struct buffer_head *parent_fe_bh, 2261 struct ocfs2_alloc_context *ac, 2262 u64 *fe_blkno) 2263 { 2264 int ret; 2265 handle_t *handle = NULL; 2266 struct ocfs2_suballoc_result *res; 2267 2268 BUG_ON(!ac); 2269 BUG_ON(ac->ac_bits_given != 0); 2270 BUG_ON(ac->ac_bits_wanted != 1); 2271 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 2272 2273 res = kzalloc_obj(*res, GFP_NOFS); 2274 if (res == NULL) { 2275 ret = -ENOMEM; 2276 mlog_errno(ret); 2277 goto out; 2278 } 2279 2280 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2281 2282 /* 2283 * The handle started 
here is for chain relink. Alternatively, 2284 * we could just disable relink for these calls. 2285 */ 2286 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); 2287 if (IS_ERR(handle)) { 2288 ret = PTR_ERR(handle); 2289 handle = NULL; 2290 mlog_errno(ret); 2291 goto out; 2292 } 2293 2294 /* 2295 * This will instruct ocfs2_claim_suballoc_bits and 2296 * ocfs2_search_one_group to search but save actual allocation 2297 * for later. 2298 */ 2299 ac->ac_find_loc_only = 1; 2300 2301 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); 2302 if (ret < 0) { 2303 mlog_errno(ret); 2304 goto out; 2305 } 2306 2307 ac->ac_find_loc_priv = res; 2308 *fe_blkno = res->sr_blkno; 2309 ocfs2_update_inode_fsync_trans(handle, dir, 0); 2310 out: 2311 if (handle) 2312 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2313 2314 if (ret) 2315 kfree(res); 2316 2317 return ret; 2318 } 2319 2320 int ocfs2_claim_new_inode_at_loc(handle_t *handle, 2321 struct inode *dir, 2322 struct ocfs2_alloc_context *ac, 2323 u64 *suballoc_loc, 2324 u16 *suballoc_bit, 2325 u64 di_blkno) 2326 { 2327 int ret; 2328 u16 chain; 2329 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; 2330 struct buffer_head *bg_bh = NULL; 2331 struct ocfs2_group_desc *bg; 2332 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; 2333 2334 /* 2335 * Since di_blkno is being passed back in, we check for any 2336 * inconsistencies which may have happened between 2337 * calls. 
These are code bugs as di_blkno is not expected to 2338 * change once returned from ocfs2_find_new_inode_loc() 2339 */ 2340 BUG_ON(res->sr_blkno != di_blkno); 2341 2342 ret = ocfs2_read_group_descriptor(ac->ac_inode, di, 2343 res->sr_bg_stable_blkno, &bg_bh); 2344 if (ret) { 2345 mlog_errno(ret); 2346 goto out; 2347 } 2348 2349 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 2350 chain = le16_to_cpu(bg->bg_chain); 2351 2352 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, 2353 ac->ac_bh, res->sr_bits, 2354 chain); 2355 if (ret) { 2356 mlog_errno(ret); 2357 goto out; 2358 } 2359 2360 ret = ocfs2_block_group_set_bits(handle, 2361 ac->ac_inode, 2362 bg, 2363 bg_bh, 2364 res->sr_bit_offset, 2365 res->sr_bits, 2366 res->sr_max_contig_bits, 2367 0); 2368 if (ret < 0) { 2369 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, 2370 ac->ac_bh, res->sr_bits, chain); 2371 mlog_errno(ret); 2372 goto out; 2373 } 2374 2375 trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, 2376 res->sr_bits); 2377 2378 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2379 2380 BUG_ON(res->sr_bits != 1); 2381 2382 *suballoc_loc = res->sr_bg_blkno; 2383 *suballoc_bit = res->sr_bit_offset; 2384 ac->ac_bits_given++; 2385 ocfs2_save_inode_ac_group(dir, ac); 2386 2387 out: 2388 brelse(bg_bh); 2389 2390 return ret; 2391 } 2392 2393 int ocfs2_claim_new_inode(handle_t *handle, 2394 struct inode *dir, 2395 struct buffer_head *parent_fe_bh, 2396 struct ocfs2_alloc_context *ac, 2397 u64 *suballoc_loc, 2398 u16 *suballoc_bit, 2399 u64 *fe_blkno) 2400 { 2401 int status; 2402 struct ocfs2_suballoc_result res; 2403 2404 BUG_ON(!ac); 2405 BUG_ON(ac->ac_bits_given != 0); 2406 BUG_ON(ac->ac_bits_wanted != 1); 2407 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 2408 2409 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2410 2411 status = ocfs2_claim_suballoc_bits(ac, 2412 handle, 2413 1, 2414 1, 2415 &res); 2416 if (status < 0) { 2417 mlog_errno(status); 2418 goto bail; 2419 } 
2420 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2421 2422 BUG_ON(res.sr_bits != 1); 2423 2424 *suballoc_loc = res.sr_bg_blkno; 2425 *suballoc_bit = res.sr_bit_offset; 2426 *fe_blkno = res.sr_blkno; 2427 ac->ac_bits_given++; 2428 ocfs2_save_inode_ac_group(dir, ac); 2429 status = 0; 2430 bail: 2431 if (status) 2432 mlog_errno(status); 2433 return status; 2434 } 2435 2436 /* translate a group desc. blkno and it's bitmap offset into 2437 * disk cluster offset. */ 2438 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 2439 u64 bg_blkno, 2440 u16 bg_bit_off) 2441 { 2442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2443 u32 cluster = 0; 2444 2445 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2446 2447 if (bg_blkno != osb->first_cluster_group_blkno) 2448 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 2449 cluster += (u32) bg_bit_off; 2450 return cluster; 2451 } 2452 2453 /* given a cluster offset, calculate which block group it belongs to 2454 * and return that block offset. */ 2455 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) 2456 { 2457 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2458 u32 group_no; 2459 2460 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2461 2462 group_no = cluster / osb->bitmap_cpg; 2463 if (!group_no) 2464 return osb->first_cluster_group_blkno; 2465 return ocfs2_clusters_to_blocks(inode->i_sb, 2466 group_no * osb->bitmap_cpg); 2467 } 2468 2469 /* given the block number of a cluster start, calculate which cluster 2470 * group and descriptor bitmap offset that corresponds to. 
*/ 2471 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 2472 u64 data_blkno, 2473 u64 *bg_blkno, 2474 u16 *bg_bit_off) 2475 { 2476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2477 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); 2478 2479 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2480 2481 *bg_blkno = ocfs2_which_cluster_group(inode, 2482 data_cluster); 2483 2484 if (*bg_blkno == osb->first_cluster_group_blkno) 2485 *bg_bit_off = (u16) data_cluster; 2486 else 2487 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, 2488 data_blkno - *bg_blkno); 2489 } 2490 2491 /* 2492 * min_bits - minimum contiguous chunk from this total allocation we 2493 * can handle. set to what we asked for originally for a full 2494 * contig. allocation, set to '1' to indicate we can deal with extents 2495 * of any size. 2496 */ 2497 int __ocfs2_claim_clusters(handle_t *handle, 2498 struct ocfs2_alloc_context *ac, 2499 u32 min_clusters, 2500 u32 max_clusters, 2501 u32 *cluster_start, 2502 u32 *num_clusters) 2503 { 2504 int status; 2505 unsigned int bits_wanted = max_clusters; 2506 struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; 2507 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); 2508 2509 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 2510 2511 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 2512 && ac->ac_which != OCFS2_AC_USE_MAIN 2513 && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG); 2514 2515 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2516 WARN_ON(min_clusters > 1); 2517 2518 status = ocfs2_claim_local_alloc_bits(osb, 2519 handle, 2520 ac, 2521 bits_wanted, 2522 cluster_start, 2523 num_clusters); 2524 if (!status) 2525 atomic_inc(&osb->alloc_stats.local_data); 2526 } else { 2527 if (min_clusters > (osb->bitmap_cpg - 1)) { 2528 /* The only paths asking for contiguousness 2529 * should know about this already. 
*/ 2530 mlog(ML_ERROR, "minimum allocation requested %u exceeds " 2531 "group bitmap size %u!\n", min_clusters, 2532 osb->bitmap_cpg); 2533 status = -ENOSPC; 2534 goto bail; 2535 } 2536 /* clamp the current request down to a realistic size. */ 2537 if (bits_wanted > (osb->bitmap_cpg - 1)) 2538 bits_wanted = osb->bitmap_cpg - 1; 2539 2540 status = ocfs2_claim_suballoc_bits(ac, 2541 handle, 2542 bits_wanted, 2543 min_clusters, 2544 &res); 2545 if (!status) { 2546 BUG_ON(res.sr_blkno); /* cluster alloc can't set */ 2547 *cluster_start = 2548 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2549 res.sr_bg_blkno, 2550 res.sr_bit_offset); 2551 atomic_inc(&osb->alloc_stats.bitmap_data); 2552 *num_clusters = res.sr_bits; 2553 } 2554 } 2555 if (status < 0) { 2556 if (status != -ENOSPC) 2557 mlog_errno(status); 2558 goto bail; 2559 } 2560 2561 ac->ac_bits_given += *num_clusters; 2562 2563 bail: 2564 if (status) 2565 mlog_errno(status); 2566 return status; 2567 } 2568 2569 int ocfs2_claim_clusters(handle_t *handle, 2570 struct ocfs2_alloc_context *ac, 2571 u32 min_clusters, 2572 u32 *cluster_start, 2573 u32 *num_clusters) 2574 { 2575 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2576 2577 return __ocfs2_claim_clusters(handle, ac, min_clusters, 2578 bits_wanted, cluster_start, num_clusters); 2579 } 2580 2581 static int ocfs2_block_group_clear_bits(handle_t *handle, 2582 struct inode *alloc_inode, 2583 struct ocfs2_group_desc *bg, 2584 struct buffer_head *group_bh, 2585 unsigned int bit_off, 2586 unsigned int num_bits, 2587 unsigned int max_contig_bits, 2588 void (*undo_fn)(unsigned int bit, 2589 unsigned long *bmap)) 2590 { 2591 int status; 2592 unsigned int tmp; 2593 u16 contig_bits; 2594 struct ocfs2_group_desc *undo_bg = NULL; 2595 struct journal_head *jh; 2596 2597 /* The caller got this descriptor from 2598 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 2599 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 2600 2601 trace_ocfs2_block_group_clear_bits(bit_off, num_bits); 2602 2603 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); 2604 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2605 group_bh, 2606 undo_fn ? 2607 OCFS2_JOURNAL_ACCESS_UNDO : 2608 OCFS2_JOURNAL_ACCESS_WRITE); 2609 if (status < 0) { 2610 mlog_errno(status); 2611 goto bail; 2612 } 2613 2614 jh = bh2jh(group_bh); 2615 if (undo_fn) { 2616 spin_lock(&jh->b_state_lock); 2617 undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; 2618 BUG_ON(!undo_bg); 2619 } 2620 2621 tmp = num_bits; 2622 while(tmp--) { 2623 ocfs2_clear_bit((bit_off + tmp), 2624 (unsigned long *) bg->bg_bitmap); 2625 if (undo_fn) 2626 undo_fn(bit_off + tmp, 2627 (unsigned long *) undo_bg->bg_bitmap); 2628 } 2629 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2630 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 2631 if (undo_fn) 2632 spin_unlock(&jh->b_state_lock); 2633 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", 2634 (unsigned long long)le64_to_cpu(bg->bg_blkno), 2635 le16_to_cpu(bg->bg_bits), 2636 le16_to_cpu(bg->bg_free_bits_count), 2637 num_bits); 2638 } 2639 2640 /* 2641 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), 2642 * we still need to rescan whole bitmap. 2643 */ 2644 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 2645 contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, 2646 le16_to_cpu(bg->bg_bits), 0); 2647 if (contig_bits > max_contig_bits) 2648 max_contig_bits = contig_bits; 2649 bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); 2650 } else { 2651 bg->bg_contig_free_bits = 0; 2652 } 2653 2654 if (undo_fn) 2655 spin_unlock(&jh->b_state_lock); 2656 2657 ocfs2_journal_dirty(handle, group_bh); 2658 bail: 2659 return status; 2660 } 2661 2662 /* 2663 * Reclaim the suballocator managed space to main bitmap. 
2664 * This function first works on the suballocator to perform the 2665 * cleanup rec/alloc_inode job, then switches to the main bitmap 2666 * to reclaim released space. 2667 * 2668 * handle: The transaction handle 2669 * alloc_inode: The suballoc inode 2670 * alloc_bh: The buffer_head of suballoc inode 2671 * group_bh: The group descriptor buffer_head of suballocator managed. 2672 * Caller should release the input group_bh. 2673 */ 2674 static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle, 2675 struct inode *alloc_inode, 2676 struct buffer_head *alloc_bh, 2677 struct buffer_head *group_bh) 2678 { 2679 int idx, status = 0; 2680 int i, next_free_rec, len = 0; 2681 __le16 old_bg_contig_free_bits = 0; 2682 u16 start_bit; 2683 u32 tmp_used; 2684 u64 bg_blkno, start_blk; 2685 unsigned int count; 2686 struct ocfs2_chain_rec *rec; 2687 struct buffer_head *main_bm_bh = NULL; 2688 struct inode *main_bm_inode = NULL; 2689 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 2690 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2691 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2692 struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data; 2693 2694 idx = le16_to_cpu(group->bg_chain); 2695 rec = &(cl->cl_recs[idx]); 2696 2697 status = ocfs2_extend_trans(handle, 2698 ocfs2_calc_group_alloc_credits(osb->sb, 2699 le16_to_cpu(cl->cl_cpg))); 2700 if (status) { 2701 mlog_errno(status); 2702 goto bail; 2703 } 2704 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2705 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2706 if (status < 0) { 2707 mlog_errno(status); 2708 goto bail; 2709 } 2710 2711 /* 2712 * Only clear the suballocator rec item in-place. 2713 * 2714 * If idx is not the last, we don't compress (remove the empty item) 2715 * the cl_recs[]. If not, we need to do lots jobs. 
2716 * 2717 * Compress cl_recs[] code example: 2718 * if (idx != cl->cl_next_free_rec - 1) 2719 * memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1], 2720 * sizeof(struct ocfs2_chain_rec) * 2721 * (cl->cl_next_free_rec - idx - 1)); 2722 * for(i = idx; i < cl->cl_next_free_rec-1; i++) { 2723 * group->bg_chain = "later group->bg_chain"; 2724 * group->bg_blkno = xxx; 2725 * ... ... 2726 * } 2727 */ 2728 2729 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total); 2730 fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total)); 2731 2732 /* Substraction 1 for the block group itself */ 2733 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2734 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1); 2735 2736 tmp_used = le32_to_cpu(fe->i_clusters); 2737 fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg)); 2738 2739 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 2740 OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters); 2741 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 2742 le32_to_cpu(fe->i_clusters))); 2743 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 2744 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 2745 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 2746 2747 ocfs2_journal_dirty(handle, alloc_bh); 2748 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); 2749 2750 start_blk = le64_to_cpu(rec->c_blkno); 2751 count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc); 2752 2753 /* 2754 * If the rec is the last one, let's compress the chain list by 2755 * removing the empty cl_recs[] at the end. 
2756 */ 2757 next_free_rec = le16_to_cpu(cl->cl_next_free_rec); 2758 if (idx == (next_free_rec - 1)) { 2759 len++; /* the last item should be counted first */ 2760 for (i = (next_free_rec - 2); i > 0; i--) { 2761 if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total) 2762 len++; 2763 else 2764 break; 2765 } 2766 } 2767 le16_add_cpu(&cl->cl_next_free_rec, -len); 2768 2769 rec->c_free = 0; 2770 rec->c_total = 0; 2771 rec->c_blkno = 0; 2772 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh); 2773 memset(group, 0, sizeof(struct ocfs2_group_desc)); 2774 2775 /* prepare job for reclaim clusters */ 2776 main_bm_inode = ocfs2_get_system_file_inode(osb, 2777 GLOBAL_BITMAP_SYSTEM_INODE, 2778 OCFS2_INVALID_SLOT); 2779 if (!main_bm_inode) 2780 goto bail; /* ignore the error in reclaim path */ 2781 2782 inode_lock(main_bm_inode); 2783 2784 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); 2785 if (status < 0) 2786 goto free_bm_inode; /* ignore the error in reclaim path */ 2787 2788 ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno, 2789 &start_bit); 2790 fe = (struct ocfs2_dinode *) main_bm_bh->b_data; 2791 cl = &fe->id2.i_chain; 2792 /* reuse group_bh, caller will release the input group_bh */ 2793 group_bh = NULL; 2794 2795 /* reclaim clusters to global_bitmap */ 2796 status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno, 2797 &group_bh); 2798 if (status < 0) { 2799 mlog_errno(status); 2800 goto free_bm_bh; 2801 } 2802 group = (struct ocfs2_group_desc *) group_bh->b_data; 2803 2804 if ((count + start_bit) > le16_to_cpu(group->bg_bits)) { 2805 ocfs2_error(alloc_inode->i_sb, 2806 "reclaim length (%d) beyands block group length (%d)", 2807 count + start_bit, le16_to_cpu(group->bg_bits)); 2808 goto free_group_bh; 2809 } 2810 2811 old_bg_contig_free_bits = group->bg_contig_free_bits; 2812 status = ocfs2_block_group_clear_bits(handle, main_bm_inode, 2813 group, group_bh, 2814 start_bit, count, 0, 2815 _ocfs2_clear_bit); 2816 if (status < 0) 
{ 2817 mlog_errno(status); 2818 goto free_group_bh; 2819 } 2820 2821 status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 2822 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2823 if (status < 0) { 2824 mlog_errno(status); 2825 ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh, 2826 start_bit, count, 2827 le16_to_cpu(old_bg_contig_free_bits), 1); 2828 goto free_group_bh; 2829 } 2830 2831 idx = le16_to_cpu(group->bg_chain); 2832 rec = &(cl->cl_recs[idx]); 2833 2834 le32_add_cpu(&rec->c_free, count); 2835 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2836 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2837 ocfs2_journal_dirty(handle, main_bm_bh); 2838 2839 free_group_bh: 2840 brelse(group_bh); 2841 2842 free_bm_bh: 2843 ocfs2_inode_unlock(main_bm_inode, 1); 2844 brelse(main_bm_bh); 2845 2846 free_bm_inode: 2847 inode_unlock(main_bm_inode); 2848 iput(main_bm_inode); 2849 2850 bail: 2851 return status; 2852 } 2853 2854 /* 2855 * expects the suballoc inode to already be locked. 2856 */ 2857 static int _ocfs2_free_suballoc_bits(handle_t *handle, 2858 struct inode *alloc_inode, 2859 struct buffer_head *alloc_bh, 2860 unsigned int start_bit, 2861 u64 bg_blkno, 2862 unsigned int count, 2863 void (*undo_fn)(unsigned int bit, 2864 unsigned long *bitmap)) 2865 { 2866 int idx, status = 0; 2867 u32 tmp_used; 2868 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2869 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2870 struct buffer_head *group_bh = NULL; 2871 struct ocfs2_group_desc *group; 2872 struct ocfs2_chain_rec *rec; 2873 __le16 old_bg_contig_free_bits = 0; 2874 2875 /* The alloc_bh comes from ocfs2_free_dinode() or 2876 * ocfs2_free_clusters(). The callers have all locked the 2877 * allocator and gotten alloc_bh from the lock call. This 2878 * validates the dinode buffer. Any corruption that has happened 2879 * is a code bug. 
*/ 2880 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2881 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 2882 2883 trace_ocfs2_free_suballoc_bits( 2884 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, 2885 (unsigned long long)bg_blkno, 2886 start_bit, count); 2887 2888 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, 2889 &group_bh); 2890 if (status < 0) { 2891 mlog_errno(status); 2892 goto bail; 2893 } 2894 group = (struct ocfs2_group_desc *) group_bh->b_data; 2895 2896 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 2897 2898 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2899 old_bg_contig_free_bits = group->bg_contig_free_bits; 2900 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2901 group, group_bh, 2902 start_bit, count, 0, undo_fn); 2903 if (status < 0) { 2904 mlog_errno(status); 2905 goto bail; 2906 } 2907 2908 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2909 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2910 if (status < 0) { 2911 mlog_errno(status); 2912 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, 2913 start_bit, count, 2914 le16_to_cpu(old_bg_contig_free_bits), 1); 2915 goto bail; 2916 } 2917 2918 idx = le16_to_cpu(group->bg_chain); 2919 rec = &(cl->cl_recs[idx]); 2920 2921 le32_add_cpu(&rec->c_free, count); 2922 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2923 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2924 ocfs2_journal_dirty(handle, alloc_bh); 2925 2926 /* 2927 * Reclaim suballocator free space. 
2928 * Bypass: global_bitmap, non empty rec, first rec in cl_recs[] 2929 */ 2930 if (ocfs2_is_cluster_bitmap(alloc_inode) || 2931 (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) || 2932 (le16_to_cpu(cl->cl_next_free_rec) == 1)) { 2933 goto bail; 2934 } 2935 2936 _ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh); 2937 2938 bail: 2939 brelse(group_bh); 2940 return status; 2941 } 2942 2943 int ocfs2_free_suballoc_bits(handle_t *handle, 2944 struct inode *alloc_inode, 2945 struct buffer_head *alloc_bh, 2946 unsigned int start_bit, 2947 u64 bg_blkno, 2948 unsigned int count) 2949 { 2950 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, 2951 start_bit, bg_blkno, count, NULL); 2952 } 2953 2954 int ocfs2_free_dinode(handle_t *handle, 2955 struct inode *inode_alloc_inode, 2956 struct buffer_head *inode_alloc_bh, 2957 struct ocfs2_dinode *di) 2958 { 2959 u64 blk = le64_to_cpu(di->i_blkno); 2960 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2961 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2962 2963 if (di->i_suballoc_loc) 2964 bg_blkno = le64_to_cpu(di->i_suballoc_loc); 2965 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2966 inode_alloc_bh, bit, bg_blkno, 1); 2967 } 2968 2969 static int _ocfs2_free_clusters(handle_t *handle, 2970 struct inode *bitmap_inode, 2971 struct buffer_head *bitmap_bh, 2972 u64 start_blk, 2973 unsigned int num_clusters, 2974 void (*undo_fn)(unsigned int bit, 2975 unsigned long *bitmap)) 2976 { 2977 int status; 2978 u16 bg_start_bit; 2979 u64 bg_blkno; 2980 2981 /* You can't ever have a contiguous set of clusters 2982 * bigger than a block group bitmap so we never have to worry 2983 * about looping on them. 2984 * This is expensive. We can safely remove once this stuff has 2985 * gotten tested really well. 
*/ 2986 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, 2987 ocfs2_blocks_to_clusters(bitmap_inode->i_sb, 2988 start_blk))); 2989 2990 2991 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 2992 &bg_start_bit); 2993 2994 trace_ocfs2_free_clusters((unsigned long long)bg_blkno, 2995 (unsigned long long)start_blk, 2996 bg_start_bit, num_clusters); 2997 2998 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2999 bg_start_bit, bg_blkno, 3000 num_clusters, undo_fn); 3001 if (status < 0) { 3002 mlog_errno(status); 3003 goto out; 3004 } 3005 3006 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), 3007 num_clusters); 3008 3009 out: 3010 return status; 3011 } 3012 3013 int ocfs2_free_clusters(handle_t *handle, 3014 struct inode *bitmap_inode, 3015 struct buffer_head *bitmap_bh, 3016 u64 start_blk, 3017 unsigned int num_clusters) 3018 { 3019 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 3020 start_blk, num_clusters, 3021 _ocfs2_set_bit); 3022 } 3023 3024 /* 3025 * Give never-used clusters back to the global bitmap. We don't need 3026 * to protect these bits in the undo buffer. 3027 */ 3028 int ocfs2_release_clusters(handle_t *handle, 3029 struct inode *bitmap_inode, 3030 struct buffer_head *bitmap_bh, 3031 u64 start_blk, 3032 unsigned int num_clusters) 3033 { 3034 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 3035 start_blk, num_clusters, 3036 _ocfs2_clear_bit); 3037 } 3038 3039 /* 3040 * For a given allocation, determine which allocators will need to be 3041 * accessed, and lock them, reserving the appropriate number of bits. 3042 * 3043 * Sparse file systems call this from ocfs2_write_begin_nolock() 3044 * and ocfs2_allocate_unwritten_extents(). 3045 * 3046 * File systems which don't support holes call this from 3047 * ocfs2_extend_allocation(). 
3048 */ 3049 int ocfs2_lock_allocators(struct inode *inode, 3050 struct ocfs2_extent_tree *et, 3051 u32 clusters_to_add, u32 extents_to_split, 3052 struct ocfs2_alloc_context **data_ac, 3053 struct ocfs2_alloc_context **meta_ac) 3054 { 3055 int ret = 0, num_free_extents; 3056 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; 3057 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3058 3059 *meta_ac = NULL; 3060 if (data_ac) 3061 *data_ac = NULL; 3062 3063 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 3064 3065 num_free_extents = ocfs2_num_free_extents(et); 3066 if (num_free_extents < 0) { 3067 ret = num_free_extents; 3068 mlog_errno(ret); 3069 goto out; 3070 } 3071 3072 /* 3073 * Sparse allocation file systems need to be more conservative 3074 * with reserving room for expansion - the actual allocation 3075 * happens while we've got a journal handle open so re-taking 3076 * a cluster lock (because we ran out of room for another 3077 * extent) will violate ordering rules. 3078 * 3079 * Most of the time we'll only be seeing this 1 cluster at a time 3080 * anyway. 3081 * 3082 * Always lock for any unwritten extents - we might want to 3083 * add blocks during a split. 3084 */ 3085 if (!num_free_extents || 3086 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { 3087 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); 3088 if (ret < 0) { 3089 if (ret != -ENOSPC) 3090 mlog_errno(ret); 3091 goto out; 3092 } 3093 } 3094 3095 if (clusters_to_add == 0) 3096 goto out; 3097 3098 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 3099 if (ret < 0) { 3100 if (ret != -ENOSPC) 3101 mlog_errno(ret); 3102 goto out; 3103 } 3104 3105 out: 3106 if (ret) { 3107 if (*meta_ac) { 3108 ocfs2_free_alloc_context(*meta_ac); 3109 *meta_ac = NULL; 3110 } 3111 3112 /* 3113 * We cannot have an error and a non null *data_ac. 
3114 */ 3115 } 3116 3117 return ret; 3118 } 3119 3120 /* 3121 * Read the inode specified by blkno to get suballoc_slot and 3122 * suballoc_bit. 3123 */ 3124 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 3125 u16 *suballoc_slot, u64 *group_blkno, 3126 u16 *suballoc_bit) 3127 { 3128 int status; 3129 struct buffer_head *inode_bh = NULL; 3130 struct ocfs2_dinode *inode_fe; 3131 3132 trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); 3133 3134 /* dirty read disk */ 3135 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 3136 if (status < 0) { 3137 mlog(ML_ERROR, "read block %llu failed %d\n", 3138 (unsigned long long)blkno, status); 3139 goto bail; 3140 } 3141 3142 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; 3143 if (!OCFS2_IS_VALID_DINODE(inode_fe)) { 3144 mlog(ML_ERROR, "invalid inode %llu requested\n", 3145 (unsigned long long)blkno); 3146 status = -EINVAL; 3147 goto bail; 3148 } 3149 3150 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && 3151 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { 3152 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", 3153 (unsigned long long)blkno, 3154 (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); 3155 status = -EINVAL; 3156 goto bail; 3157 } 3158 3159 if (suballoc_slot) 3160 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 3161 if (suballoc_bit) 3162 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 3163 if (group_blkno) 3164 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); 3165 3166 bail: 3167 brelse(inode_bh); 3168 3169 if (status) 3170 mlog_errno(status); 3171 return status; 3172 } 3173 3174 /* 3175 * test whether bit is SET in allocator bitmap or not. on success, 0 3176 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno 3177 * is returned and *res is meaningless. 
Call this after you have 3178 * cluster locked against suballoc, or you may get a result based on 3179 * non-up2date contents 3180 */ 3181 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 3182 struct inode *suballoc, 3183 struct buffer_head *alloc_bh, 3184 u64 group_blkno, u64 blkno, 3185 u16 bit, int *res) 3186 { 3187 struct ocfs2_dinode *alloc_di; 3188 struct ocfs2_group_desc *group; 3189 struct buffer_head *group_bh = NULL; 3190 u64 bg_blkno; 3191 int status, quiet = 0, released = 0; 3192 3193 trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, 3194 (unsigned int)bit); 3195 3196 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; 3197 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { 3198 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 3199 (unsigned int)bit, 3200 ocfs2_bits_per_group(&alloc_di->id2.i_chain)); 3201 status = -EINVAL; 3202 goto bail; 3203 } 3204 3205 bg_blkno = group_blkno ? group_blkno : 3206 ocfs2_which_suballoc_group(blkno, bit); 3207 status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno, 3208 &group_bh, &released); 3209 if (released) { 3210 quiet = 1; 3211 status = -ESTALE; 3212 goto bail; 3213 } else if (status < 0) { 3214 mlog(ML_ERROR, "read group %llu failed %d\n", 3215 (unsigned long long)bg_blkno, status); 3216 goto bail; 3217 } 3218 3219 group = (struct ocfs2_group_desc *) group_bh->b_data; 3220 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); 3221 3222 bail: 3223 brelse(group_bh); 3224 3225 if (status && !quiet) 3226 mlog_errno(status); 3227 return status; 3228 } 3229 3230 /* 3231 * Test if the bit representing this inode (blkno) is set in the 3232 * suballocator. 3233 * 3234 * On success, 0 is returned and *res is 1 for SET; 0 otherwise. 3235 * 3236 * In the event of failure, a negative value is returned and *res is 3237 * meaningless. 
3238 * 3239 * Callers must make sure to hold nfs_sync_lock to prevent 3240 * ocfs2_delete_inode() on another node from accessing the same 3241 * suballocator concurrently. 3242 */ 3243 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 3244 { 3245 int status, quiet = 0; 3246 u64 group_blkno = 0; 3247 u16 suballoc_bit = 0, suballoc_slot = 0; 3248 struct inode *inode_alloc_inode; 3249 struct buffer_head *alloc_bh = NULL; 3250 3251 trace_ocfs2_test_inode_bit((unsigned long long)blkno); 3252 3253 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 3254 &group_blkno, &suballoc_bit); 3255 if (status < 0) { 3256 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 3257 goto bail; 3258 } 3259 3260 if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) 3261 inode_alloc_inode = ocfs2_get_system_file_inode(osb, 3262 GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); 3263 else 3264 inode_alloc_inode = ocfs2_get_system_file_inode(osb, 3265 INODE_ALLOC_SYSTEM_INODE, suballoc_slot); 3266 if (!inode_alloc_inode) { 3267 /* the error code could be inaccurate, but we are not able to 3268 * get the correct one. 
*/ 3269 status = -EINVAL; 3270 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", 3271 (u32)suballoc_slot); 3272 goto bail; 3273 } 3274 3275 inode_lock(inode_alloc_inode); 3276 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 3277 if (status < 0) { 3278 inode_unlock(inode_alloc_inode); 3279 iput(inode_alloc_inode); 3280 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 3281 (u32)suballoc_slot, status); 3282 goto bail; 3283 } 3284 3285 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 3286 group_blkno, blkno, suballoc_bit, res); 3287 if (status < 0) { 3288 if (status == -ESTALE) 3289 quiet = 1; 3290 else 3291 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 3292 } 3293 3294 ocfs2_inode_unlock(inode_alloc_inode, 0); 3295 inode_unlock(inode_alloc_inode); 3296 3297 iput(inode_alloc_inode); 3298 brelse(alloc_bh); 3299 bail: 3300 if (status && !quiet) 3301 mlog_errno(status); 3302 return status; 3303 } 3304