1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * suballoc.c 4 * 5 * metadata alloc and free 6 * Inspired by ext3 block groups. 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 */ 10 11 #include <linux/fs.h> 12 #include <linux/types.h> 13 #include <linux/slab.h> 14 #include <linux/string.h> 15 #include <linux/highmem.h> 16 17 #include <cluster/masklog.h> 18 19 #include "ocfs2.h" 20 21 #include "alloc.h" 22 #include "blockcheck.h" 23 #include "dlmglue.h" 24 #include "inode.h" 25 #include "journal.h" 26 #include "localalloc.h" 27 #include "suballoc.h" 28 #include "super.h" 29 #include "sysfile.h" 30 #include "uptodate.h" 31 #include "ocfs2_trace.h" 32 33 #include "buffer_head_io.h" 34 35 #define NOT_ALLOC_NEW_GROUP 0 36 #define ALLOC_NEW_GROUP 0x1 37 #define ALLOC_GROUPS_FROM_GLOBAL 0x2 38 39 #define OCFS2_MAX_TO_STEAL 1024 40 41 struct ocfs2_suballoc_result { 42 u64 sr_bg_blkno; /* The bg we allocated from. Set 43 to 0 when a block group is 44 contiguous. */ 45 u64 sr_bg_stable_blkno; /* 46 * Doesn't change, always 47 * set to target block 48 * group descriptor 49 * block. 
50 */ 51 u64 sr_blkno; /* The first allocated block */ 52 unsigned int sr_bit_offset; /* The bit in the bg */ 53 unsigned int sr_bits; /* How many bits we claimed */ 54 unsigned int sr_max_contig_bits; /* The length for contiguous 55 * free bits, only available 56 * for cluster group 57 */ 58 }; 59 60 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) 61 { 62 if (res->sr_blkno == 0) 63 return 0; 64 65 if (res->sr_bg_blkno) 66 return res->sr_bg_blkno; 67 68 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); 69 } 70 71 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 72 static int ocfs2_block_group_fill(handle_t *handle, 73 struct inode *alloc_inode, 74 struct buffer_head *bg_bh, 75 u64 group_blkno, 76 unsigned int group_clusters, 77 u16 my_chain, 78 struct ocfs2_chain_list *cl); 79 static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 80 struct inode *alloc_inode, 81 struct buffer_head *bh, 82 u64 max_block, 83 u64 *last_alloc_group, 84 int flags); 85 86 static int ocfs2_cluster_group_search(struct inode *inode, 87 struct buffer_head *group_bh, 88 u32 bits_wanted, u32 min_bits, 89 u64 max_block, 90 struct ocfs2_suballoc_result *res); 91 static int ocfs2_block_group_search(struct inode *inode, 92 struct buffer_head *group_bh, 93 u32 bits_wanted, u32 min_bits, 94 u64 max_block, 95 struct ocfs2_suballoc_result *res); 96 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, 97 handle_t *handle, 98 u32 bits_wanted, 99 u32 min_bits, 100 struct ocfs2_suballoc_result *res); 101 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 102 int nr); 103 static int ocfs2_relink_block_group(handle_t *handle, 104 struct inode *alloc_inode, 105 struct buffer_head *fe_bh, 106 struct buffer_head *bg_bh, 107 struct buffer_head *prev_bg_bh, 108 u16 chain); 109 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 110 u32 wanted); 111 static inline u32 
ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 112 u64 bg_blkno, 113 u16 bg_bit_off); 114 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 115 u64 data_blkno, 116 u64 *bg_blkno, 117 u16 *bg_bit_off); 118 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 119 u32 bits_wanted, u64 max_block, 120 int flags, 121 struct ocfs2_alloc_context **ac); 122 123 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 124 { 125 struct inode *inode = ac->ac_inode; 126 127 if (inode) { 128 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 129 ocfs2_inode_unlock(inode, 1); 130 131 inode_unlock(inode); 132 133 iput(inode); 134 ac->ac_inode = NULL; 135 } 136 brelse(ac->ac_bh); 137 ac->ac_bh = NULL; 138 ac->ac_resv = NULL; 139 kfree(ac->ac_find_loc_priv); 140 ac->ac_find_loc_priv = NULL; 141 } 142 143 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 144 { 145 ocfs2_free_ac_resource(ac); 146 kfree(ac); 147 } 148 149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) 150 { 151 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); 152 } 153 154 #define do_error(fmt, ...) 
\ 155 do { \ 156 if (resize) \ 157 mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ 158 else \ 159 return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 160 } while (0) 161 162 static int ocfs2_validate_gd_self(struct super_block *sb, 163 struct buffer_head *bh, 164 int resize) 165 { 166 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 167 168 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 169 do_error("Group descriptor #%llu has bad signature %.*s\n", 170 (unsigned long long)bh->b_blocknr, 7, 171 gd->bg_signature); 172 } 173 174 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { 175 do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", 176 (unsigned long long)bh->b_blocknr, 177 (unsigned long long)le64_to_cpu(gd->bg_blkno)); 178 } 179 180 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { 181 do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", 182 (unsigned long long)bh->b_blocknr, 183 le32_to_cpu(gd->bg_generation)); 184 } 185 186 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 187 do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", 188 (unsigned long long)bh->b_blocknr, 189 le16_to_cpu(gd->bg_bits), 190 le16_to_cpu(gd->bg_free_bits_count)); 191 } 192 193 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 194 do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", 195 (unsigned long long)bh->b_blocknr, 196 le16_to_cpu(gd->bg_bits), 197 8 * le16_to_cpu(gd->bg_size)); 198 } 199 200 /* 201 * For discontiguous block groups, validate the on-disk extent list 202 * against the maximum number of extent records that can physically 203 * fit in a single block. 
204 */ 205 if (ocfs2_gd_is_discontig(gd)) { 206 u16 max_recs = ocfs2_extent_recs_per_gd(sb); 207 u16 l_count = le16_to_cpu(gd->bg_list.l_count); 208 u16 l_next_free_rec = le16_to_cpu(gd->bg_list.l_next_free_rec); 209 210 if (l_count != max_recs) { 211 do_error("Group descriptor #%llu bad discontig l_count %u expected %u\n", 212 (unsigned long long)bh->b_blocknr, 213 l_count, 214 max_recs); 215 } 216 217 if (l_next_free_rec > l_count) { 218 do_error("Group descriptor #%llu bad discontig l_next_free_rec %u max %u\n", 219 (unsigned long long)bh->b_blocknr, 220 l_next_free_rec, 221 l_count); 222 } 223 } 224 225 return 0; 226 } 227 228 static int ocfs2_validate_gd_parent(struct super_block *sb, 229 struct ocfs2_dinode *di, 230 struct buffer_head *bh, 231 int resize) 232 { 233 unsigned int max_bits; 234 unsigned int max_bitmap_bits; 235 unsigned int max_bitmap_size; 236 int suballocator; 237 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 238 239 suballocator = le64_to_cpu(di->i_blkno) != OCFS2_SB(sb)->bitmap_blkno; 240 max_bitmap_size = ocfs2_group_bitmap_size(sb, suballocator, 241 OCFS2_SB(sb)->s_feature_incompat); 242 max_bitmap_bits = max_bitmap_size * 8; 243 244 if (di->i_blkno != gd->bg_parent_dinode) { 245 do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", 246 (unsigned long long)bh->b_blocknr, 247 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 248 (unsigned long long)le64_to_cpu(di->i_blkno)); 249 } 250 251 if (le16_to_cpu(gd->bg_size) > max_bitmap_size) { 252 do_error("Group descriptor #%llu has bitmap size %u but physical max of %u\n", 253 (unsigned long long)bh->b_blocknr, 254 le16_to_cpu(gd->bg_size), 255 max_bitmap_size); 256 } 257 258 if (le16_to_cpu(gd->bg_bits) > max_bitmap_bits) { 259 do_error("Group descriptor #%llu has bit count %u but physical max of %u\n", 260 (unsigned long long)bh->b_blocknr, 261 le16_to_cpu(gd->bg_bits), 262 max_bitmap_bits); 263 } 264 265 max_bits = 
le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 266 if (le16_to_cpu(gd->bg_bits) > max_bits) { 267 do_error("Group descriptor #%llu has bit count of %u\n", 268 (unsigned long long)bh->b_blocknr, 269 le16_to_cpu(gd->bg_bits)); 270 } 271 272 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ 273 if ((le16_to_cpu(gd->bg_chain) > 274 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || 275 ((le16_to_cpu(gd->bg_chain) == 276 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { 277 do_error("Group descriptor #%llu has bad chain %u\n", 278 (unsigned long long)bh->b_blocknr, 279 le16_to_cpu(gd->bg_chain)); 280 } 281 282 return 0; 283 } 284 285 #undef do_error 286 287 /* 288 * This version only prints errors. It does not fail the filesystem, and 289 * exists only for resize. 290 */ 291 int ocfs2_check_group_descriptor(struct super_block *sb, 292 struct ocfs2_dinode *di, 293 struct buffer_head *bh) 294 { 295 int rc; 296 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 297 298 BUG_ON(!buffer_uptodate(bh)); 299 300 /* 301 * If the ecc fails, we return the error but otherwise 302 * leave the filesystem running. We know any error is 303 * local to this block. 304 */ 305 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); 306 if (rc) { 307 mlog(ML_ERROR, 308 "Checksum failed for group descriptor %llu\n", 309 (unsigned long long)bh->b_blocknr); 310 } else 311 rc = ocfs2_validate_gd_self(sb, bh, 1); 312 if (!rc) 313 rc = ocfs2_validate_gd_parent(sb, di, bh, 1); 314 315 return rc; 316 } 317 318 static int ocfs2_validate_group_descriptor(struct super_block *sb, 319 struct buffer_head *bh) 320 { 321 int rc; 322 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 323 324 trace_ocfs2_validate_group_descriptor( 325 (unsigned long long)bh->b_blocknr); 326 327 BUG_ON(!buffer_uptodate(bh)); 328 329 /* 330 * If the ecc fails, we return the error but otherwise 331 * leave the filesystem running. 
We know any error is 332 * local to this block. 333 */ 334 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); 335 if (rc) 336 return rc; 337 338 /* 339 * Errors after here are fatal. 340 */ 341 342 return ocfs2_validate_gd_self(sb, bh, 0); 343 } 344 345 /* 346 * The hint group descriptor (gd) may already have been released 347 * in _ocfs2_free_suballoc_bits(). We first check the gd signature, 348 * then perform the standard ocfs2_read_group_descriptor() jobs. 349 * 350 * If the gd signature is invalid, we return 'rc=0' and set 351 * '*released=1'. The caller is expected to handle this specific case. 352 * Otherwise, we return the actual error code. 353 * 354 * We treat gd signature corruption case as a release case. The 355 * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain() 356 * to search each gd block. The code will eventually find this 357 * corrupted gd block - Late, but not missed. 358 * 359 * Note: 360 * The caller is responsible for initializing the '*released' status. 361 */ 362 static int ocfs2_read_hint_group_descriptor(struct inode *inode, 363 struct ocfs2_dinode *di, u64 gd_blkno, 364 struct buffer_head **bh, int *released) 365 { 366 int rc; 367 struct buffer_head *tmp = *bh; 368 struct ocfs2_group_desc *gd; 369 370 rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL); 371 if (rc) 372 goto out; 373 374 gd = (struct ocfs2_group_desc *) tmp->b_data; 375 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 376 /* 377 * Invalid gd cache was set in ocfs2_read_block(), 378 * which will affect block_group allocation. 
379 * Path: 380 * ocfs2_reserve_suballoc_bits 381 * ocfs2_block_group_alloc 382 * ocfs2_block_group_alloc_contig 383 * ocfs2_set_new_buffer_uptodate 384 */ 385 ocfs2_remove_from_cache(INODE_CACHE(inode), tmp); 386 *released = 1; /* we return 'rc=0' for this case */ 387 goto free_bh; 388 } 389 390 /* below jobs same with ocfs2_read_group_descriptor() */ 391 if (!buffer_jbd(tmp)) { 392 rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp); 393 if (rc) 394 goto free_bh; 395 } 396 397 rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); 398 if (rc) 399 goto free_bh; 400 401 /* If ocfs2_read_block() got us a new bh, pass it up. */ 402 if (!*bh) 403 *bh = tmp; 404 405 return rc; 406 407 free_bh: 408 brelse(tmp); 409 out: 410 return rc; 411 } 412 413 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, 414 u64 gd_blkno, struct buffer_head **bh) 415 { 416 int rc; 417 struct buffer_head *tmp = *bh; 418 419 rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, 420 ocfs2_validate_group_descriptor); 421 if (rc) 422 goto out; 423 424 rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); 425 if (rc) { 426 brelse(tmp); 427 goto out; 428 } 429 430 /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ 431 if (!*bh) 432 *bh = tmp; 433 434 out: 435 return rc; 436 } 437 438 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, 439 struct ocfs2_group_desc *bg, 440 struct ocfs2_chain_list *cl, 441 u64 p_blkno, unsigned int clusters) 442 { 443 struct ocfs2_extent_list *el = &bg->bg_list; 444 struct ocfs2_extent_rec *rec; 445 446 BUG_ON(!ocfs2_supports_discontig_bg(osb)); 447 if (!el->l_next_free_rec) 448 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); 449 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; 450 rec->e_blkno = cpu_to_le64(p_blkno); 451 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / 452 le16_to_cpu(cl->cl_bpc)); 453 rec->e_leaf_clusters = cpu_to_le16(clusters); 454 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); 455 le16_add_cpu(&bg->bg_free_bits_count, 456 clusters * le16_to_cpu(cl->cl_bpc)); 457 le16_add_cpu(&el->l_next_free_rec, 1); 458 } 459 460 static int ocfs2_block_group_fill(handle_t *handle, 461 struct inode *alloc_inode, 462 struct buffer_head *bg_bh, 463 u64 group_blkno, 464 unsigned int group_clusters, 465 u16 my_chain, 466 struct ocfs2_chain_list *cl) 467 { 468 int status = 0; 469 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 470 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 471 struct super_block * sb = alloc_inode->i_sb; 472 473 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 474 status = ocfs2_error(alloc_inode->i_sb, 475 "group block (%llu) != b_blocknr (%llu)\n", 476 (unsigned long long)group_blkno, 477 (unsigned long long) bg_bh->b_blocknr); 478 goto bail; 479 } 480 481 status = ocfs2_journal_access_gd(handle, 482 INODE_CACHE(alloc_inode), 483 bg_bh, 484 OCFS2_JOURNAL_ACCESS_CREATE); 485 if (status < 0) { 486 mlog_errno(status); 487 goto bail; 488 } 489 490 memset(bg, 0, sb->s_blocksize); 491 strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 492 bg->bg_generation = cpu_to_le32(osb->fs_generation); 493 bg->bg_size = 
cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, 494 osb->s_feature_incompat)); 495 bg->bg_chain = cpu_to_le16(my_chain); 496 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 497 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 498 bg->bg_blkno = cpu_to_le64(group_blkno); 499 if (group_clusters == le16_to_cpu(cl->cl_cpg)) 500 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 501 else 502 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, 503 group_clusters); 504 505 /* set the 1st bit in the bitmap to account for the descriptor block */ 506 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 507 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 508 509 ocfs2_journal_dirty(handle, bg_bh); 510 511 /* There is no need to zero out or otherwise initialize the 512 * other blocks in a group - All valid FS metadata in a block 513 * group stores the superblock fs_generation value at 514 * allocation time. */ 515 516 bail: 517 if (status) 518 mlog_errno(status); 519 return status; 520 } 521 522 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) 523 { 524 u16 curr, best; 525 526 best = curr = 0; 527 while (curr < le16_to_cpu(cl->cl_count)) { 528 if (le32_to_cpu(cl->cl_recs[best].c_total) > 529 le32_to_cpu(cl->cl_recs[curr].c_total)) 530 best = curr; 531 curr++; 532 } 533 return best; 534 } 535 536 static struct buffer_head * 537 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, 538 struct inode *alloc_inode, 539 struct ocfs2_alloc_context *ac, 540 struct ocfs2_chain_list *cl) 541 { 542 int status; 543 u32 bit_off, num_bits; 544 u64 bg_blkno; 545 struct buffer_head *bg_bh; 546 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); 547 548 status = ocfs2_claim_clusters(handle, ac, 549 le16_to_cpu(cl->cl_cpg), &bit_off, 550 &num_bits); 551 if (status < 0) { 552 if (status != -ENOSPC) 553 mlog_errno(status); 554 goto bail; 555 } 556 557 /* setup the group */ 558 bg_blkno = 
ocfs2_clusters_to_blocks(osb->sb, bit_off); 559 trace_ocfs2_block_group_alloc_contig( 560 (unsigned long long)bg_blkno, alloc_rec); 561 562 bg_bh = sb_getblk(osb->sb, bg_blkno); 563 if (!bg_bh) { 564 status = -ENOMEM; 565 mlog_errno(status); 566 goto bail; 567 } 568 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); 569 570 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, 571 bg_blkno, num_bits, alloc_rec, cl); 572 if (status < 0) { 573 brelse(bg_bh); 574 mlog_errno(status); 575 } 576 577 bail: 578 return status ? ERR_PTR(status) : bg_bh; 579 } 580 581 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, 582 handle_t *handle, 583 struct ocfs2_alloc_context *ac, 584 unsigned int min_bits, 585 u32 *bit_off, u32 *num_bits) 586 { 587 int status = 0; 588 589 while (min_bits) { 590 status = ocfs2_claim_clusters(handle, ac, min_bits, 591 bit_off, num_bits); 592 if (status != -ENOSPC) 593 break; 594 595 min_bits >>= 1; 596 } 597 598 return status; 599 } 600 601 static int ocfs2_block_group_grow_discontig(handle_t *handle, 602 struct inode *alloc_inode, 603 struct buffer_head *bg_bh, 604 struct ocfs2_alloc_context *ac, 605 struct ocfs2_chain_list *cl, 606 unsigned int min_bits) 607 { 608 int status; 609 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 610 struct ocfs2_group_desc *bg = 611 (struct ocfs2_group_desc *)bg_bh->b_data; 612 unsigned int needed = le16_to_cpu(cl->cl_cpg) - 613 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); 614 u32 p_cpos, clusters; 615 u64 p_blkno; 616 struct ocfs2_extent_list *el = &bg->bg_list; 617 618 status = ocfs2_journal_access_gd(handle, 619 INODE_CACHE(alloc_inode), 620 bg_bh, 621 OCFS2_JOURNAL_ACCESS_CREATE); 622 if (status < 0) { 623 mlog_errno(status); 624 goto bail; 625 } 626 627 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < 628 le16_to_cpu(el->l_count))) { 629 if (min_bits > needed) 630 min_bits = needed; 631 status = ocfs2_block_group_claim_bits(osb, handle, ac, 632 min_bits, 
&p_cpos, 633 &clusters); 634 if (status < 0) { 635 if (status != -ENOSPC) 636 mlog_errno(status); 637 goto bail; 638 } 639 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); 640 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, 641 clusters); 642 643 min_bits = clusters; 644 needed = le16_to_cpu(cl->cl_cpg) - 645 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); 646 } 647 648 if (needed > 0) { 649 /* 650 * We have used up all the extent rec but can't fill up 651 * the cpg. So bail out. 652 */ 653 status = -ENOSPC; 654 goto bail; 655 } 656 657 ocfs2_journal_dirty(handle, bg_bh); 658 659 bail: 660 return status; 661 } 662 663 static void ocfs2_bg_alloc_cleanup(handle_t *handle, 664 struct ocfs2_alloc_context *cluster_ac, 665 struct inode *alloc_inode, 666 struct buffer_head *bg_bh) 667 { 668 int i, ret; 669 struct ocfs2_group_desc *bg; 670 struct ocfs2_extent_list *el; 671 struct ocfs2_extent_rec *rec; 672 673 if (!bg_bh) 674 return; 675 676 bg = (struct ocfs2_group_desc *)bg_bh->b_data; 677 el = &bg->bg_list; 678 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 679 rec = &el->l_recs[i]; 680 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, 681 cluster_ac->ac_bh, 682 le64_to_cpu(rec->e_blkno), 683 le16_to_cpu(rec->e_leaf_clusters)); 684 if (ret) 685 mlog_errno(ret); 686 /* Try all the clusters to free */ 687 } 688 689 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); 690 brelse(bg_bh); 691 } 692 693 static struct buffer_head * 694 ocfs2_block_group_alloc_discontig(handle_t *handle, 695 struct inode *alloc_inode, 696 struct ocfs2_alloc_context *ac, 697 struct ocfs2_chain_list *cl) 698 { 699 int status; 700 u32 bit_off, num_bits; 701 u64 bg_blkno; 702 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; 703 struct buffer_head *bg_bh = NULL; 704 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); 705 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 706 707 if (!ocfs2_supports_discontig_bg(osb)) { 708 status = -ENOSPC; 709 goto 
bail; 710 } 711 712 status = ocfs2_extend_trans(handle, 713 ocfs2_calc_bg_discontig_credits(osb->sb)); 714 if (status) { 715 mlog_errno(status); 716 goto bail; 717 } 718 719 /* 720 * We're going to be grabbing from multiple cluster groups. 721 * We don't have enough credits to relink them all, and the 722 * cluster groups will be staying in cache for the duration of 723 * this operation. 724 */ 725 ac->ac_disable_chain_relink = 1; 726 727 /* Claim the first region */ 728 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, 729 &bit_off, &num_bits); 730 if (status < 0) { 731 if (status != -ENOSPC) 732 mlog_errno(status); 733 goto bail; 734 } 735 min_bits = num_bits; 736 737 /* setup the group */ 738 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 739 trace_ocfs2_block_group_alloc_discontig( 740 (unsigned long long)bg_blkno, alloc_rec); 741 742 bg_bh = sb_getblk(osb->sb, bg_blkno); 743 if (!bg_bh) { 744 status = -ENOMEM; 745 mlog_errno(status); 746 goto bail; 747 } 748 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); 749 750 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, 751 bg_blkno, num_bits, alloc_rec, cl); 752 if (status < 0) { 753 mlog_errno(status); 754 goto bail; 755 } 756 757 status = ocfs2_block_group_grow_discontig(handle, alloc_inode, 758 bg_bh, ac, cl, min_bits); 759 if (status) 760 mlog_errno(status); 761 762 bail: 763 if (status) 764 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); 765 return status ? ERR_PTR(status) : bg_bh; 766 } 767 768 /* 769 * We expect the block group allocator to already be locked. 
/*
 * Add one new block group to a suballocator inode.
 *
 * Reserves a group's worth of clusters, builds the group descriptor
 * (contiguously first, falling back to a discontiguous group when that
 * fails with -ENOSPC), then links the group into the allocator dinode's
 * chain list and updates the dinode's bit and cluster totals.
 *
 * @osb:              ocfs2 superblock info
 * @alloc_inode:      the suballocator inode; must not be the cluster bitmap
 * @bh:               buffer holding @alloc_inode's on-disk dinode
 * @max_block:        passed to the cluster reservation as its block limit
 * @last_alloc_group: optional in/out hint.  A non-zero value on entry
 *                    seeds the reservation's ac_last_group; on success it
 *                    receives the reservation's final ac_last_group.
 * @flags:            ALLOC_* flags forwarded to the cluster reservation
 *
 * Returns 0 on success or a negative error code.
 *
 * We expect the block group allocator to already be locked.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	/* Reserve one full group (cl_cpg clusters) from the cluster allocator. */
	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		/* Keep the bail path from committing an error pointer. */
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Seed the reservation with the caller's cached group, if any. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	/* Try one contiguous run first; only -ENOSPC triggers the fallback. */
	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		/* Keep the bail path from brelse()ing an error pointer. */
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Account the new group in its chain record and make it the chain head. */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	/* Bits the new group already uses count as used in the allocator. */
	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	/* Bring the in-memory inode in line with the updated dinode. */
	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	/* Shared exit: each resource is released only if it was acquired. */
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}
*/ 922 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 923 924 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 925 status = ocfs2_error(alloc_inode->i_sb, 926 "Invalid chain allocator %llu\n", 927 (unsigned long long)le64_to_cpu(fe->i_blkno)); 928 goto bail; 929 } 930 931 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - 932 le32_to_cpu(fe->id1.bitmap1.i_used); 933 934 if (bits_wanted > free_bits) { 935 /* cluster bitmap never grows */ 936 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 937 trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, 938 free_bits); 939 status = -ENOSPC; 940 goto bail; 941 } 942 943 if (!(flags & ALLOC_NEW_GROUP)) { 944 trace_ocfs2_reserve_suballoc_bits_no_new_group( 945 slot, bits_wanted, free_bits); 946 status = -ENOSPC; 947 goto bail; 948 } 949 950 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 951 ac->ac_max_block, 952 last_alloc_group, flags); 953 if (status < 0) { 954 if (status != -ENOSPC) 955 mlog_errno(status); 956 goto bail; 957 } 958 atomic_inc(&osb->alloc_stats.bg_extends); 959 960 /* You should never ask for this much metadata */ 961 BUG_ON(bits_wanted > 962 (le32_to_cpu(fe->id1.bitmap1.i_total) 963 - le32_to_cpu(fe->id1.bitmap1.i_used))); 964 } 965 966 get_bh(bh); 967 ac->ac_bh = bh; 968 bail: 969 brelse(bh); 970 971 if (status) 972 mlog_errno(status); 973 return status; 974 } 975 976 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 977 { 978 spin_lock(&osb->osb_lock); 979 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; 980 spin_unlock(&osb->osb_lock); 981 atomic_set(&osb->s_num_inodes_stolen, 0); 982 } 983 984 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) 985 { 986 spin_lock(&osb->osb_lock); 987 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; 988 spin_unlock(&osb->osb_lock); 989 atomic_set(&osb->s_num_meta_stolen, 0); 990 } 991 992 void ocfs2_init_steal_slots(struct ocfs2_super *osb) 993 { 994 ocfs2_init_inode_steal_slot(osb); 995 ocfs2_init_meta_steal_slot(osb); 996 } 997 998 static void 
__ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) 999 { 1000 spin_lock(&osb->osb_lock); 1001 if (type == INODE_ALLOC_SYSTEM_INODE) 1002 osb->s_inode_steal_slot = (u16)slot; 1003 else if (type == EXTENT_ALLOC_SYSTEM_INODE) 1004 osb->s_meta_steal_slot = (u16)slot; 1005 spin_unlock(&osb->osb_lock); 1006 } 1007 1008 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) 1009 { 1010 int slot = OCFS2_INVALID_SLOT; 1011 1012 spin_lock(&osb->osb_lock); 1013 if (type == INODE_ALLOC_SYSTEM_INODE) 1014 slot = osb->s_inode_steal_slot; 1015 else if (type == EXTENT_ALLOC_SYSTEM_INODE) 1016 slot = osb->s_meta_steal_slot; 1017 spin_unlock(&osb->osb_lock); 1018 1019 return slot; 1020 } 1021 1022 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) 1023 { 1024 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); 1025 } 1026 1027 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) 1028 { 1029 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); 1030 } 1031 1032 static int ocfs2_steal_resource(struct ocfs2_super *osb, 1033 struct ocfs2_alloc_context *ac, 1034 int type) 1035 { 1036 int i, status = -ENOSPC; 1037 int slot = __ocfs2_get_steal_slot(osb, type); 1038 1039 /* Start to steal resource from the first slot after ours. 
*/ 1040 if (slot == OCFS2_INVALID_SLOT) 1041 slot = osb->slot_num + 1; 1042 1043 for (i = 0; i < osb->max_slots; i++, slot++) { 1044 if (slot == osb->max_slots) 1045 slot = 0; 1046 1047 if (slot == osb->slot_num) 1048 continue; 1049 1050 status = ocfs2_reserve_suballoc_bits(osb, ac, 1051 type, 1052 (u32)slot, NULL, 1053 NOT_ALLOC_NEW_GROUP); 1054 if (status >= 0) { 1055 __ocfs2_set_steal_slot(osb, slot, type); 1056 break; 1057 } 1058 1059 ocfs2_free_ac_resource(ac); 1060 } 1061 1062 return status; 1063 } 1064 1065 static int ocfs2_steal_inode(struct ocfs2_super *osb, 1066 struct ocfs2_alloc_context *ac) 1067 { 1068 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); 1069 } 1070 1071 static int ocfs2_steal_meta(struct ocfs2_super *osb, 1072 struct ocfs2_alloc_context *ac) 1073 { 1074 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); 1075 } 1076 1077 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 1078 int blocks, 1079 struct ocfs2_alloc_context **ac) 1080 { 1081 int status; 1082 int slot = ocfs2_get_meta_steal_slot(osb); 1083 1084 *ac = kzalloc_obj(struct ocfs2_alloc_context); 1085 if (!(*ac)) { 1086 status = -ENOMEM; 1087 mlog_errno(status); 1088 goto bail; 1089 } 1090 1091 (*ac)->ac_bits_wanted = blocks; 1092 (*ac)->ac_which = OCFS2_AC_USE_META; 1093 (*ac)->ac_group_search = ocfs2_block_group_search; 1094 1095 if (slot != OCFS2_INVALID_SLOT && 1096 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) 1097 goto extent_steal; 1098 1099 atomic_set(&osb->s_num_meta_stolen, 0); 1100 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1101 EXTENT_ALLOC_SYSTEM_INODE, 1102 (u32)osb->slot_num, NULL, 1103 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); 1104 1105 1106 if (status >= 0) { 1107 status = 0; 1108 if (slot != OCFS2_INVALID_SLOT) 1109 ocfs2_init_meta_steal_slot(osb); 1110 goto bail; 1111 } else if (status < 0 && status != -ENOSPC) { 1112 mlog_errno(status); 1113 goto bail; 1114 } 1115 1116 ocfs2_free_ac_resource(*ac); 1117 
1118 extent_steal: 1119 status = ocfs2_steal_meta(osb, *ac); 1120 atomic_inc(&osb->s_num_meta_stolen); 1121 if (status < 0) { 1122 if (status != -ENOSPC) 1123 mlog_errno(status); 1124 goto bail; 1125 } 1126 1127 status = 0; 1128 bail: 1129 if ((status < 0) && *ac) { 1130 ocfs2_free_alloc_context(*ac); 1131 *ac = NULL; 1132 } 1133 1134 if (status) 1135 mlog_errno(status); 1136 return status; 1137 } 1138 1139 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 1140 struct ocfs2_extent_list *root_el, 1141 struct ocfs2_alloc_context **ac) 1142 { 1143 return ocfs2_reserve_new_metadata_blocks(osb, 1144 ocfs2_extend_meta_needed(root_el), 1145 ac); 1146 } 1147 1148 int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 1149 struct ocfs2_alloc_context **ac) 1150 { 1151 int status; 1152 int slot = ocfs2_get_inode_steal_slot(osb); 1153 u64 alloc_group; 1154 1155 *ac = kzalloc_obj(struct ocfs2_alloc_context); 1156 if (!(*ac)) { 1157 status = -ENOMEM; 1158 mlog_errno(status); 1159 goto bail; 1160 } 1161 1162 (*ac)->ac_bits_wanted = 1; 1163 (*ac)->ac_which = OCFS2_AC_USE_INODE; 1164 1165 (*ac)->ac_group_search = ocfs2_block_group_search; 1166 1167 /* 1168 * stat(2) can't handle i_ino > 32bits, so we tell the 1169 * lower levels not to allocate us a block group past that 1170 * limit. The 'inode64' mount option avoids this behavior. 1171 */ 1172 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) 1173 (*ac)->ac_max_block = (u32)~0U; 1174 1175 /* 1176 * slot is set when we successfully steal inode from other nodes. 1177 * It is reset in 3 places: 1178 * 1. when we flush the truncate log 1179 * 2. when we complete local alloc recovery. 1180 * 3. when we successfully allocate from our own slot. 1181 * After it is set, we will go on stealing inodes until we find the 1182 * need to check our slots to see whether there is some space for us. 
1183 */ 1184 if (slot != OCFS2_INVALID_SLOT && 1185 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) 1186 goto inode_steal; 1187 1188 atomic_set(&osb->s_num_inodes_stolen, 0); 1189 alloc_group = osb->osb_inode_alloc_group; 1190 status = ocfs2_reserve_suballoc_bits(osb, *ac, 1191 INODE_ALLOC_SYSTEM_INODE, 1192 (u32)osb->slot_num, 1193 &alloc_group, 1194 ALLOC_NEW_GROUP | 1195 ALLOC_GROUPS_FROM_GLOBAL); 1196 if (status >= 0) { 1197 status = 0; 1198 1199 spin_lock(&osb->osb_lock); 1200 osb->osb_inode_alloc_group = alloc_group; 1201 spin_unlock(&osb->osb_lock); 1202 trace_ocfs2_reserve_new_inode_new_group( 1203 (unsigned long long)alloc_group); 1204 1205 /* 1206 * Some inodes must be freed by us, so try to allocate 1207 * from our own next time. 1208 */ 1209 if (slot != OCFS2_INVALID_SLOT) 1210 ocfs2_init_inode_steal_slot(osb); 1211 goto bail; 1212 } else if (status < 0 && status != -ENOSPC) { 1213 mlog_errno(status); 1214 goto bail; 1215 } 1216 1217 ocfs2_free_ac_resource(*ac); 1218 1219 inode_steal: 1220 status = ocfs2_steal_inode(osb, *ac); 1221 atomic_inc(&osb->s_num_inodes_stolen); 1222 if (status < 0) { 1223 if (status != -ENOSPC) 1224 mlog_errno(status); 1225 goto bail; 1226 } 1227 1228 status = 0; 1229 bail: 1230 if ((status < 0) && *ac) { 1231 ocfs2_free_alloc_context(*ac); 1232 *ac = NULL; 1233 } 1234 1235 if (status) 1236 mlog_errno(status); 1237 return status; 1238 } 1239 1240 /* local alloc code has to do the same thing, so rather than do this 1241 * twice.. 
*/ 1242 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 1243 struct ocfs2_alloc_context *ac) 1244 { 1245 int status; 1246 1247 ac->ac_which = OCFS2_AC_USE_MAIN; 1248 ac->ac_group_search = ocfs2_cluster_group_search; 1249 1250 status = ocfs2_reserve_suballoc_bits(osb, ac, 1251 GLOBAL_BITMAP_SYSTEM_INODE, 1252 OCFS2_INVALID_SLOT, NULL, 1253 ALLOC_NEW_GROUP); 1254 if (status < 0 && status != -ENOSPC) 1255 mlog_errno(status); 1256 1257 return status; 1258 } 1259 1260 /* Callers don't need to care which bitmap (local alloc or main) to 1261 * use so we figure it out for them, but unfortunately this clutters 1262 * things a bit. */ 1263 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 1264 u32 bits_wanted, u64 max_block, 1265 int flags, 1266 struct ocfs2_alloc_context **ac) 1267 { 1268 int status, ret = 0; 1269 int retried = 0; 1270 1271 *ac = kzalloc_obj(struct ocfs2_alloc_context); 1272 if (!(*ac)) { 1273 status = -ENOMEM; 1274 mlog_errno(status); 1275 goto bail; 1276 } 1277 1278 (*ac)->ac_bits_wanted = bits_wanted; 1279 (*ac)->ac_max_block = max_block; 1280 1281 status = -ENOSPC; 1282 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && 1283 ocfs2_alloc_should_use_local(osb, bits_wanted)) { 1284 status = ocfs2_reserve_local_alloc_bits(osb, 1285 bits_wanted, 1286 *ac); 1287 if ((status < 0) && (status != -ENOSPC)) { 1288 mlog_errno(status); 1289 goto bail; 1290 } 1291 } 1292 1293 if (status == -ENOSPC) { 1294 retry: 1295 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1296 /* Retry if there is sufficient space cached in truncate log */ 1297 if (status == -ENOSPC && !retried) { 1298 retried = 1; 1299 ocfs2_inode_unlock((*ac)->ac_inode, 1); 1300 inode_unlock((*ac)->ac_inode); 1301 1302 ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); 1303 if (ret == 1) { 1304 iput((*ac)->ac_inode); 1305 (*ac)->ac_inode = NULL; 1306 goto retry; 1307 } 1308 1309 if (ret < 0) 1310 mlog_errno(ret); 1311 1312 inode_lock((*ac)->ac_inode); 1313 ret = 
ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); 1314 if (ret < 0) { 1315 mlog_errno(ret); 1316 inode_unlock((*ac)->ac_inode); 1317 iput((*ac)->ac_inode); 1318 (*ac)->ac_inode = NULL; 1319 goto bail; 1320 } 1321 } 1322 if (status < 0) { 1323 if (status != -ENOSPC) 1324 mlog_errno(status); 1325 goto bail; 1326 } 1327 } 1328 1329 status = 0; 1330 bail: 1331 if ((status < 0) && *ac) { 1332 ocfs2_free_alloc_context(*ac); 1333 *ac = NULL; 1334 } 1335 1336 if (status) 1337 mlog_errno(status); 1338 return status; 1339 } 1340 1341 int ocfs2_reserve_clusters(struct ocfs2_super *osb, 1342 u32 bits_wanted, 1343 struct ocfs2_alloc_context **ac) 1344 { 1345 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, 1346 ALLOC_NEW_GROUP, ac); 1347 } 1348 1349 /* 1350 * More or less lifted from ext3. I'll leave their description below: 1351 * 1352 * "For ext3 allocations, we must not reuse any blocks which are 1353 * allocated in the bitmap buffer's "last committed data" copy. This 1354 * prevents deletes from freeing up the page for reuse until we have 1355 * committed the delete transaction. 1356 * 1357 * If we didn't do this, then deleting something and reallocating it as 1358 * data would allow the old block to be overwritten before the 1359 * transaction committed (because we force data to disk before commit). 1360 * This would lead to corruption if we crashed between overwriting the 1361 * data and committing the delete. 1362 * 1363 * @@@ We may want to make this allocation behaviour conditional on 1364 * data-writes at some point, and disable it for metadata allocations or 1365 * sync-data inodes." 1366 * 1367 * Note: OCFS2 already does this differently for metadata vs data 1368 * allocations, as those bitmaps are separate and undo access is never 1369 * called on a metadata group descriptor. 
1370 */ 1371 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 1372 int nr) 1373 { 1374 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1375 struct journal_head *jh; 1376 int ret; 1377 1378 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) 1379 return 0; 1380 1381 jh = jbd2_journal_grab_journal_head(bg_bh); 1382 if (!jh) 1383 return 1; 1384 1385 spin_lock(&jh->b_state_lock); 1386 bg = (struct ocfs2_group_desc *) jh->b_committed_data; 1387 if (bg) 1388 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); 1389 else 1390 ret = 1; 1391 spin_unlock(&jh->b_state_lock); 1392 jbd2_journal_put_journal_head(jh); 1393 1394 return ret; 1395 } 1396 1397 u16 ocfs2_find_max_contig_free_bits(void *bitmap, 1398 u16 total_bits, u16 start) 1399 { 1400 u16 offset, free_bits; 1401 u16 contig_bits = 0; 1402 1403 while (start < total_bits) { 1404 offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); 1405 if (offset == total_bits) 1406 break; 1407 1408 start = ocfs2_find_next_bit(bitmap, total_bits, offset); 1409 free_bits = start - offset; 1410 if (contig_bits < free_bits) 1411 contig_bits = free_bits; 1412 } 1413 1414 return contig_bits; 1415 } 1416 1417 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 1418 struct buffer_head *bg_bh, 1419 unsigned int bits_wanted, 1420 unsigned int total_bits, 1421 struct ocfs2_suballoc_result *res) 1422 { 1423 void *bitmap; 1424 u16 best_offset, best_size; 1425 u16 prev_best_size = 0; 1426 int offset, start, found, status = 0; 1427 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1428 1429 /* Callers got this descriptor from 1430 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 1431 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1432 1433 found = start = best_offset = best_size = 0; 1434 bitmap = bg->bg_bitmap; 1435 1436 while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) < 1437 total_bits) { 1438 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { 1439 /* We found a zero, but we can't use it as it 1440 * hasn't been put to disk yet! */ 1441 found = 0; 1442 start = offset + 1; 1443 } else if (offset == start) { 1444 /* we found a zero */ 1445 found++; 1446 /* move start to the next bit to test */ 1447 start++; 1448 } else { 1449 /* got a zero after some ones */ 1450 found = 1; 1451 start = offset + 1; 1452 prev_best_size = best_size; 1453 } 1454 if (found > best_size) { 1455 best_size = found; 1456 best_offset = start - found; 1457 } 1458 /* we got everything we needed */ 1459 if (found == bits_wanted) { 1460 /* mlog(0, "Found it all!\n"); */ 1461 break; 1462 } 1463 } 1464 1465 /* best_size will be allocated, we save prev_best_size */ 1466 res->sr_max_contig_bits = prev_best_size; 1467 if (best_size) { 1468 res->sr_bit_offset = best_offset; 1469 res->sr_bits = best_size; 1470 } else { 1471 status = -ENOSPC; 1472 /* No error log here -- see the comment above 1473 * ocfs2_test_bg_bit_allocatable */ 1474 } 1475 1476 return status; 1477 } 1478 1479 int ocfs2_block_group_set_bits(handle_t *handle, 1480 struct inode *alloc_inode, 1481 struct ocfs2_group_desc *bg, 1482 struct buffer_head *group_bh, 1483 unsigned int bit_off, 1484 unsigned int num_bits, 1485 unsigned int max_contig_bits, 1486 int fastpath) 1487 { 1488 int status; 1489 void *bitmap = bg->bg_bitmap; 1490 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1491 unsigned int start = bit_off + num_bits; 1492 u16 contig_bits; 1493 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 1494 1495 /* All callers get the descriptor via 1496 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 1497 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1498 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 1499 1500 trace_ocfs2_block_group_set_bits(bit_off, num_bits); 1501 1502 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1503 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1504 1505 status = ocfs2_journal_access_gd(handle, 1506 INODE_CACHE(alloc_inode), 1507 group_bh, 1508 journal_type); 1509 if (status < 0) { 1510 mlog_errno(status); 1511 goto bail; 1512 } 1513 1514 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1515 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 1516 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", 1517 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1518 le16_to_cpu(bg->bg_bits), 1519 le16_to_cpu(bg->bg_free_bits_count), 1520 num_bits); 1521 } 1522 while(num_bits--) 1523 ocfs2_set_bit(bit_off++, bitmap); 1524 1525 /* 1526 * this is optimize path, caller set old contig value 1527 * in max_contig_bits to bypass finding action. 1528 */ 1529 if (fastpath) { 1530 bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); 1531 } else if (ocfs2_is_cluster_bitmap(alloc_inode)) { 1532 /* 1533 * Usually, the block group bitmap allocates only 1 bit 1534 * at a time, while the cluster group allocates n bits 1535 * each time. Therefore, we only save the contig bits for 1536 * the cluster group. 
1537 */ 1538 contig_bits = ocfs2_find_max_contig_free_bits(bitmap, 1539 le16_to_cpu(bg->bg_bits), start); 1540 if (contig_bits > max_contig_bits) 1541 max_contig_bits = contig_bits; 1542 bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); 1543 ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits); 1544 } else { 1545 bg->bg_contig_free_bits = 0; 1546 } 1547 1548 ocfs2_journal_dirty(handle, group_bh); 1549 1550 bail: 1551 return status; 1552 } 1553 1554 /* find the one with the most empty bits */ 1555 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) 1556 { 1557 u16 curr, best; 1558 1559 BUG_ON(!cl->cl_next_free_rec); 1560 1561 best = curr = 0; 1562 while (curr < le16_to_cpu(cl->cl_next_free_rec)) { 1563 if (le32_to_cpu(cl->cl_recs[curr].c_free) > 1564 le32_to_cpu(cl->cl_recs[best].c_free)) 1565 best = curr; 1566 curr++; 1567 } 1568 1569 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); 1570 return best; 1571 } 1572 1573 static int ocfs2_relink_block_group(handle_t *handle, 1574 struct inode *alloc_inode, 1575 struct buffer_head *fe_bh, 1576 struct buffer_head *bg_bh, 1577 struct buffer_head *prev_bg_bh, 1578 u16 chain) 1579 { 1580 int status; 1581 /* there is a really tiny chance the journal calls could fail, 1582 * but we wouldn't want inconsistent blocks in *any* case. */ 1583 u64 bg_ptr, prev_bg_ptr; 1584 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1585 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1586 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1587 1588 /* The caller got these descriptors from 1589 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 1590 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1591 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); 1592 1593 trace_ocfs2_relink_block_group( 1594 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1595 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1596 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1597 1598 bg_ptr = le64_to_cpu(bg->bg_next_group); 1599 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1600 1601 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1602 prev_bg_bh, 1603 OCFS2_JOURNAL_ACCESS_WRITE); 1604 if (status < 0) 1605 goto out; 1606 1607 prev_bg->bg_next_group = bg->bg_next_group; 1608 ocfs2_journal_dirty(handle, prev_bg_bh); 1609 1610 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1611 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1612 if (status < 0) 1613 goto out_rollback_prev_bg; 1614 1615 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1616 ocfs2_journal_dirty(handle, bg_bh); 1617 1618 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1619 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1620 if (status < 0) 1621 goto out_rollback_bg; 1622 1623 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1624 ocfs2_journal_dirty(handle, fe_bh); 1625 1626 out: 1627 if (status < 0) 1628 mlog_errno(status); 1629 return status; 1630 1631 out_rollback_bg: 1632 bg->bg_next_group = cpu_to_le64(bg_ptr); 1633 out_rollback_prev_bg: 1634 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1635 goto out; 1636 } 1637 1638 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 1639 u32 wanted) 1640 { 1641 return le16_to_cpu(bg->bg_free_bits_count) > wanted; 1642 } 1643 1644 /* return 0 on success, -ENOSPC to keep searching and any other < 0 1645 * value on error. 
*/ 1646 static int ocfs2_cluster_group_search(struct inode *inode, 1647 struct buffer_head *group_bh, 1648 u32 bits_wanted, u32 min_bits, 1649 u64 max_block, 1650 struct ocfs2_suballoc_result *res) 1651 { 1652 int search = -ENOSPC; 1653 int ret; 1654 u64 blkoff; 1655 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1656 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1657 unsigned int max_bits, gd_cluster_off; 1658 1659 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1660 1661 if (le16_to_cpu(gd->bg_contig_free_bits) && 1662 le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted) 1663 return -ENOSPC; 1664 1665 /* ->bg_contig_free_bits may un-initialized, so compare again */ 1666 if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) { 1667 max_bits = le16_to_cpu(gd->bg_bits); 1668 1669 /* Tail groups in cluster bitmaps which aren't cpg 1670 * aligned are prone to partial extension by a failed 1671 * fs resize. If the file system resize never got to 1672 * update the dinode cluster count, then we don't want 1673 * to trust any clusters past it, regardless of what 1674 * the group descriptor says. 
*/ 1675 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, 1676 le64_to_cpu(gd->bg_blkno)); 1677 if ((gd_cluster_off + max_bits) > 1678 OCFS2_I(inode)->ip_clusters) { 1679 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; 1680 trace_ocfs2_cluster_group_search_wrong_max_bits( 1681 (unsigned long long)le64_to_cpu(gd->bg_blkno), 1682 le16_to_cpu(gd->bg_bits), 1683 OCFS2_I(inode)->ip_clusters, max_bits); 1684 } 1685 1686 ret = ocfs2_block_group_find_clear_bits(osb, 1687 group_bh, bits_wanted, 1688 max_bits, res); 1689 if (ret) 1690 return ret; 1691 1692 if (max_block) { 1693 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1694 gd_cluster_off + 1695 res->sr_bit_offset + 1696 res->sr_bits); 1697 trace_ocfs2_cluster_group_search_max_block( 1698 (unsigned long long)blkoff, 1699 (unsigned long long)max_block); 1700 if (blkoff > max_block) 1701 return -ENOSPC; 1702 } 1703 1704 /* ocfs2_block_group_find_clear_bits() might 1705 * return success, but we still want to return 1706 * -ENOSPC unless it found the minimum number 1707 * of bits. 
*/ 1708 if (min_bits <= res->sr_bits) 1709 search = 0; /* success */ 1710 } 1711 1712 return search; 1713 } 1714 1715 static int ocfs2_block_group_search(struct inode *inode, 1716 struct buffer_head *group_bh, 1717 u32 bits_wanted, u32 min_bits, 1718 u64 max_block, 1719 struct ocfs2_suballoc_result *res) 1720 { 1721 int ret = -ENOSPC; 1722 u64 blkoff; 1723 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 1724 1725 BUG_ON(min_bits != 1); 1726 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 1727 1728 if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) { 1729 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1730 group_bh, bits_wanted, 1731 le16_to_cpu(bg->bg_bits), 1732 res); 1733 if (!ret && max_block) { 1734 blkoff = le64_to_cpu(bg->bg_blkno) + 1735 res->sr_bit_offset + res->sr_bits; 1736 trace_ocfs2_block_group_search_max_block( 1737 (unsigned long long)blkoff, 1738 (unsigned long long)max_block); 1739 if (blkoff > max_block) 1740 ret = -ENOSPC; 1741 } 1742 } 1743 1744 return ret; 1745 } 1746 1747 int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1748 handle_t *handle, 1749 struct buffer_head *di_bh, 1750 u32 num_bits, 1751 u16 chain) 1752 { 1753 int ret; 1754 u32 tmp_used; 1755 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1756 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1757 1758 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 1759 OCFS2_JOURNAL_ACCESS_WRITE); 1760 if (ret < 0) { 1761 mlog_errno(ret); 1762 goto out; 1763 } 1764 1765 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1766 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1767 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1768 ocfs2_journal_dirty(handle, di_bh); 1769 1770 out: 1771 return ret; 1772 } 1773 1774 void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, 1775 struct buffer_head *di_bh, 1776 u32 num_bits, 1777 u16 chain) 1778 { 1779 u32 tmp_used; 1780 struct 
ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1781 struct ocfs2_chain_list *cl; 1782 1783 cl = (struct ocfs2_chain_list *)&di->id2.i_chain; 1784 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1785 di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); 1786 le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); 1787 } 1788 1789 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, 1790 struct ocfs2_extent_rec *rec, 1791 struct ocfs2_chain_list *cl) 1792 { 1793 unsigned int bpc = le16_to_cpu(cl->cl_bpc); 1794 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; 1795 unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; 1796 1797 if (res->sr_bit_offset < bitoff) 1798 return 0; 1799 if (res->sr_bit_offset >= (bitoff + bitcount)) 1800 return 0; 1801 res->sr_blkno = le64_to_cpu(rec->e_blkno) + 1802 (res->sr_bit_offset - bitoff); 1803 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) 1804 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; 1805 return 1; 1806 } 1807 1808 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, 1809 struct ocfs2_group_desc *bg, 1810 struct ocfs2_suballoc_result *res) 1811 { 1812 int i; 1813 u64 bg_blkno = res->sr_bg_blkno; /* Save off */ 1814 struct ocfs2_extent_rec *rec; 1815 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1816 struct ocfs2_chain_list *cl = &di->id2.i_chain; 1817 1818 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { 1819 res->sr_blkno = 0; 1820 return; 1821 } 1822 1823 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; 1824 res->sr_bg_blkno = 0; /* Clear it for contig block groups */ 1825 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || 1826 !bg->bg_list.l_next_free_rec) 1827 return; 1828 1829 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { 1830 rec = &bg->bg_list.l_recs[i]; 1831 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { 1832 res->sr_bg_blkno = bg_blkno; /* Restore */ 1833 
break; 1834 } 1835 } 1836 } 1837 1838 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1839 handle_t *handle, 1840 u32 bits_wanted, 1841 u32 min_bits, 1842 struct ocfs2_suballoc_result *res, 1843 u16 *bits_left, int *released) 1844 { 1845 int ret; 1846 struct buffer_head *group_bh = NULL; 1847 struct ocfs2_group_desc *gd; 1848 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1849 struct inode *alloc_inode = ac->ac_inode; 1850 1851 ret = ocfs2_read_hint_group_descriptor(alloc_inode, di, 1852 res->sr_bg_blkno, &group_bh, released); 1853 if (*released) { 1854 return 0; 1855 } else if (ret < 0) { 1856 mlog_errno(ret); 1857 return ret; 1858 } 1859 1860 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1861 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1862 ac->ac_max_block, res); 1863 if (ret < 0) { 1864 if (ret != -ENOSPC) 1865 mlog_errno(ret); 1866 goto out; 1867 } 1868 1869 if (!ret) 1870 ocfs2_bg_discontig_fix_result(ac, gd, res); 1871 1872 /* 1873 * sr_bg_blkno might have been changed by 1874 * ocfs2_bg_discontig_fix_result 1875 */ 1876 res->sr_bg_stable_blkno = group_bh->b_blocknr; 1877 1878 if (ac->ac_find_loc_only) 1879 goto out_loc_only; 1880 1881 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1882 res->sr_bits, 1883 le16_to_cpu(gd->bg_chain)); 1884 if (ret < 0) { 1885 mlog_errno(ret); 1886 goto out; 1887 } 1888 1889 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1890 res->sr_bit_offset, res->sr_bits, 1891 res->sr_max_contig_bits, 0); 1892 if (ret < 0) { 1893 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, 1894 res->sr_bits, 1895 le16_to_cpu(gd->bg_chain)); 1896 mlog_errno(ret); 1897 } 1898 1899 out_loc_only: 1900 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1901 1902 out: 1903 brelse(group_bh); 1904 1905 return ret; 1906 } 1907 1908 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, 1909 handle_t *handle, 1910 u32 bits_wanted, 
1911 u32 min_bits, 1912 struct ocfs2_suballoc_result *res, 1913 u16 *bits_left) 1914 { 1915 int status; 1916 u16 chain; 1917 u32 contig_bits; 1918 u64 next_group; 1919 struct inode *alloc_inode = ac->ac_inode; 1920 struct buffer_head *group_bh = NULL; 1921 struct buffer_head *prev_group_bh = NULL; 1922 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1923 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1924 struct ocfs2_group_desc *bg; 1925 1926 chain = ac->ac_chain; 1927 trace_ocfs2_search_chain_begin( 1928 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, 1929 bits_wanted, chain); 1930 1931 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1932 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1933 &group_bh); 1934 if (status < 0) { 1935 mlog_errno(status); 1936 goto bail; 1937 } 1938 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1939 1940 status = -ENOSPC; 1941 /* for now, the chain search is a bit simplistic. We just use 1942 * the 1st group with any empty bits. 
*/ 1943 while (1) { 1944 if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) { 1945 contig_bits = le16_to_cpu(bg->bg_contig_free_bits); 1946 if (!contig_bits) 1947 contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, 1948 le16_to_cpu(bg->bg_bits), 0); 1949 if (bits_wanted > contig_bits && contig_bits >= min_bits) 1950 bits_wanted = contig_bits; 1951 } 1952 1953 status = ac->ac_group_search(alloc_inode, group_bh, 1954 bits_wanted, min_bits, 1955 ac->ac_max_block, res); 1956 if (status != -ENOSPC) 1957 break; 1958 if (!bg->bg_next_group) 1959 break; 1960 1961 brelse(prev_group_bh); 1962 prev_group_bh = NULL; 1963 1964 next_group = le64_to_cpu(bg->bg_next_group); 1965 prev_group_bh = group_bh; 1966 group_bh = NULL; 1967 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1968 next_group, &group_bh); 1969 if (status < 0) { 1970 mlog_errno(status); 1971 goto bail; 1972 } 1973 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1974 } 1975 if (status < 0) { 1976 if (status != -ENOSPC) 1977 mlog_errno(status); 1978 goto bail; 1979 } 1980 1981 trace_ocfs2_search_chain_succ( 1982 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); 1983 1984 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); 1985 1986 BUG_ON(res->sr_bits == 0); 1987 if (!status) 1988 ocfs2_bg_discontig_fix_result(ac, bg, res); 1989 1990 /* 1991 * sr_bg_blkno might have been changed by 1992 * ocfs2_bg_discontig_fix_result 1993 */ 1994 res->sr_bg_stable_blkno = group_bh->b_blocknr; 1995 1996 /* 1997 * Keep track of previous block descriptor read. When 1998 * we find a target, if we have read more than X 1999 * number of descriptors, and the target is reasonably 2000 * empty, relink him to top of his chain. 2001 * 2002 * We've read 0 extra blocks and only send one more to 2003 * the transaction, yet the next guy to search has a 2004 * much easier time. 2005 * 2006 * Do this *after* figuring out how many bits we're taking out 2007 * of our target group. 
2008 */ 2009 if (!ac->ac_disable_chain_relink && 2010 (prev_group_bh) && 2011 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { 2012 status = ocfs2_relink_block_group(handle, alloc_inode, 2013 ac->ac_bh, group_bh, 2014 prev_group_bh, chain); 2015 if (status < 0) { 2016 mlog_errno(status); 2017 goto bail; 2018 } 2019 } 2020 2021 if (ac->ac_find_loc_only) 2022 goto out_loc_only; 2023 2024 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, 2025 ac->ac_bh, res->sr_bits, 2026 chain); 2027 if (status) { 2028 mlog_errno(status); 2029 goto bail; 2030 } 2031 2032 status = ocfs2_block_group_set_bits(handle, 2033 alloc_inode, 2034 bg, 2035 group_bh, 2036 res->sr_bit_offset, 2037 res->sr_bits, 2038 res->sr_max_contig_bits, 2039 0); 2040 if (status < 0) { 2041 ocfs2_rollback_alloc_dinode_counts(alloc_inode, 2042 ac->ac_bh, res->sr_bits, chain); 2043 mlog_errno(status); 2044 goto bail; 2045 } 2046 2047 trace_ocfs2_search_chain_end( 2048 (unsigned long long)le64_to_cpu(fe->i_blkno), 2049 res->sr_bits); 2050 2051 out_loc_only: 2052 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 2053 bail: 2054 brelse(group_bh); 2055 brelse(prev_group_bh); 2056 2057 if (status) 2058 mlog_errno(status); 2059 return status; 2060 } 2061 2062 /* will give out up to bits_wanted contiguous bits. */ 2063 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, 2064 handle_t *handle, 2065 u32 bits_wanted, 2066 u32 min_bits, 2067 struct ocfs2_suballoc_result *res) 2068 { 2069 int status; 2070 int released = 0; 2071 u16 victim, i; 2072 u16 bits_left = 0; 2073 u64 hint = ac->ac_last_group; 2074 struct ocfs2_chain_list *cl; 2075 struct ocfs2_dinode *fe; 2076 2077 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 2078 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); 2079 BUG_ON(!ac->ac_bh); 2080 2081 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 2082 2083 /* The bh was validated by the inode read during 2084 * ocfs2_reserve_suballoc_bits(). 
Any corruption is a code bug. */ 2085 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2086 2087 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 2088 le32_to_cpu(fe->id1.bitmap1.i_total)) { 2089 status = ocfs2_error(ac->ac_inode->i_sb, 2090 "Chain allocator dinode %llu has %u used bits but only %u total\n", 2091 (unsigned long long)le64_to_cpu(fe->i_blkno), 2092 le32_to_cpu(fe->id1.bitmap1.i_used), 2093 le32_to_cpu(fe->id1.bitmap1.i_total)); 2094 goto bail; 2095 } 2096 2097 /* the hint bg may already be released, we quiet search this group. */ 2098 res->sr_bg_blkno = hint; 2099 if (res->sr_bg_blkno) { 2100 /* Attempt to short-circuit the usual search mechanism 2101 * by jumping straight to the most recently used 2102 * allocation group. This helps us maintain some 2103 * contiguousness across allocations. */ 2104 status = ocfs2_search_one_group(ac, handle, bits_wanted, 2105 min_bits, res, &bits_left, 2106 &released); 2107 if (released) { 2108 res->sr_bg_blkno = 0; 2109 goto chain_search; 2110 } 2111 if (!status) 2112 goto set_hint; 2113 if (status < 0 && status != -ENOSPC) { 2114 mlog_errno(status); 2115 goto bail; 2116 } 2117 } 2118 chain_search: 2119 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 2120 if (!le16_to_cpu(cl->cl_next_free_rec) || 2121 le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { 2122 status = ocfs2_error(ac->ac_inode->i_sb, 2123 "Chain allocator dinode %llu has invalid next " 2124 "free chain record %u, but only %u total\n", 2125 (unsigned long long)le64_to_cpu(fe->i_blkno), 2126 le16_to_cpu(cl->cl_next_free_rec), 2127 le16_to_cpu(cl->cl_count)); 2128 goto bail; 2129 } 2130 2131 victim = ocfs2_find_victim_chain(cl); 2132 ac->ac_chain = victim; 2133 2134 search: 2135 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 2136 res, &bits_left); 2137 if (!status) { 2138 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) 2139 hint = res->sr_bg_blkno; 2140 else 2141 hint = ocfs2_group_from_res(res); 2142 goto set_hint; 2143 } 2144 if (status < 0 && 
status != -ENOSPC) { 2145 mlog_errno(status); 2146 goto bail; 2147 } 2148 2149 trace_ocfs2_claim_suballoc_bits(victim); 2150 2151 /* If we didn't pick a good victim, then just default to 2152 * searching each chain in order. Don't allow chain relinking 2153 * because we only calculate enough journal credits for one 2154 * relink per alloc. */ 2155 ac->ac_disable_chain_relink = 1; 2156 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { 2157 if (i == victim) 2158 continue; 2159 if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted) 2160 continue; 2161 2162 ac->ac_chain = i; 2163 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 2164 res, &bits_left); 2165 if (!status) { 2166 hint = ocfs2_group_from_res(res); 2167 break; 2168 } 2169 if (status < 0 && status != -ENOSPC) { 2170 mlog_errno(status); 2171 goto bail; 2172 } 2173 } 2174 2175 /* Chains can't supply the bits_wanted contiguous space. 2176 * We should switch to using every single bit when allocating 2177 * from the global bitmap. 
*/ 2178 if (i == le16_to_cpu(cl->cl_next_free_rec) && 2179 status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) { 2180 ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; 2181 ac->ac_chain = victim; 2182 goto search; 2183 } 2184 2185 set_hint: 2186 if (status != -ENOSPC) { 2187 /* If the next search of this group is not likely to 2188 * yield a suitable extent, then we reset the last 2189 * group hint so as to not waste a disk read */ 2190 if (bits_left < min_bits) 2191 ac->ac_last_group = 0; 2192 else 2193 ac->ac_last_group = hint; 2194 } 2195 2196 bail: 2197 if (status) 2198 mlog_errno(status); 2199 return status; 2200 } 2201 2202 int ocfs2_claim_metadata(handle_t *handle, 2203 struct ocfs2_alloc_context *ac, 2204 u32 bits_wanted, 2205 u64 *suballoc_loc, 2206 u16 *suballoc_bit_start, 2207 unsigned int *num_bits, 2208 u64 *blkno_start) 2209 { 2210 int status; 2211 struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; 2212 2213 BUG_ON(!ac); 2214 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 2215 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 2216 2217 status = ocfs2_claim_suballoc_bits(ac, 2218 handle, 2219 bits_wanted, 2220 1, 2221 &res); 2222 if (status < 0) { 2223 mlog_errno(status); 2224 goto bail; 2225 } 2226 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2227 2228 *suballoc_loc = res.sr_bg_blkno; 2229 *suballoc_bit_start = res.sr_bit_offset; 2230 *blkno_start = res.sr_blkno; 2231 ac->ac_bits_given += res.sr_bits; 2232 *num_bits = res.sr_bits; 2233 status = 0; 2234 bail: 2235 if (status) 2236 mlog_errno(status); 2237 return status; 2238 } 2239 2240 /* 2241 * after ocfs2 has the ability to release block group unused space, 2242 * the ->ip_last_used_group may be invalid. so this function returns 2243 * ac->ac_last_group need to verify. 2244 * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details. 
2245 */ 2246 static void ocfs2_init_inode_ac_group(struct inode *dir, 2247 struct buffer_head *parent_di_bh, 2248 struct ocfs2_alloc_context *ac) 2249 { 2250 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; 2251 /* 2252 * Try to allocate inodes from some specific group. 2253 * 2254 * If the parent dir has recorded the last group used in allocation, 2255 * cool, use it. Otherwise if we try to allocate new inode from the 2256 * same slot the parent dir belongs to, use the same chunk. 2257 * 2258 * We are very careful here to avoid the mistake of setting 2259 * ac_last_group to a group descriptor from a different (unlocked) slot. 2260 */ 2261 if (OCFS2_I(dir)->ip_last_used_group && 2262 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2263 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2264 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { 2265 if (di->i_suballoc_loc) 2266 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); 2267 else 2268 ac->ac_last_group = ocfs2_which_suballoc_group( 2269 le64_to_cpu(di->i_blkno), 2270 le16_to_cpu(di->i_suballoc_bit)); 2271 } 2272 } 2273 2274 static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2275 struct ocfs2_alloc_context *ac) 2276 { 2277 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; 2278 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2279 } 2280 2281 int ocfs2_find_new_inode_loc(struct inode *dir, 2282 struct buffer_head *parent_fe_bh, 2283 struct ocfs2_alloc_context *ac, 2284 u64 *fe_blkno) 2285 { 2286 int ret; 2287 handle_t *handle = NULL; 2288 struct ocfs2_suballoc_result *res; 2289 2290 BUG_ON(!ac); 2291 BUG_ON(ac->ac_bits_given != 0); 2292 BUG_ON(ac->ac_bits_wanted != 1); 2293 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 2294 2295 res = kzalloc_obj(*res, GFP_NOFS); 2296 if (res == NULL) { 2297 ret = -ENOMEM; 2298 mlog_errno(ret); 2299 goto out; 2300 } 2301 2302 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2303 2304 /* 2305 * The handle started 
here is for chain relink. Alternatively, 2306 * we could just disable relink for these calls. 2307 */ 2308 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); 2309 if (IS_ERR(handle)) { 2310 ret = PTR_ERR(handle); 2311 handle = NULL; 2312 mlog_errno(ret); 2313 goto out; 2314 } 2315 2316 /* 2317 * This will instruct ocfs2_claim_suballoc_bits and 2318 * ocfs2_search_one_group to search but save actual allocation 2319 * for later. 2320 */ 2321 ac->ac_find_loc_only = 1; 2322 2323 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); 2324 if (ret < 0) { 2325 mlog_errno(ret); 2326 goto out; 2327 } 2328 2329 ac->ac_find_loc_priv = res; 2330 *fe_blkno = res->sr_blkno; 2331 ocfs2_update_inode_fsync_trans(handle, dir, 0); 2332 out: 2333 if (handle) 2334 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2335 2336 if (ret) 2337 kfree(res); 2338 2339 return ret; 2340 } 2341 2342 int ocfs2_claim_new_inode_at_loc(handle_t *handle, 2343 struct inode *dir, 2344 struct ocfs2_alloc_context *ac, 2345 u64 *suballoc_loc, 2346 u16 *suballoc_bit, 2347 u64 di_blkno) 2348 { 2349 int ret; 2350 u16 chain; 2351 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; 2352 struct buffer_head *bg_bh = NULL; 2353 struct ocfs2_group_desc *bg; 2354 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; 2355 2356 /* 2357 * Since di_blkno is being passed back in, we check for any 2358 * inconsistencies which may have happened between 2359 * calls. 
These are code bugs as di_blkno is not expected to 2360 * change once returned from ocfs2_find_new_inode_loc() 2361 */ 2362 BUG_ON(res->sr_blkno != di_blkno); 2363 2364 ret = ocfs2_read_group_descriptor(ac->ac_inode, di, 2365 res->sr_bg_stable_blkno, &bg_bh); 2366 if (ret) { 2367 mlog_errno(ret); 2368 goto out; 2369 } 2370 2371 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 2372 chain = le16_to_cpu(bg->bg_chain); 2373 2374 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, 2375 ac->ac_bh, res->sr_bits, 2376 chain); 2377 if (ret) { 2378 mlog_errno(ret); 2379 goto out; 2380 } 2381 2382 ret = ocfs2_block_group_set_bits(handle, 2383 ac->ac_inode, 2384 bg, 2385 bg_bh, 2386 res->sr_bit_offset, 2387 res->sr_bits, 2388 res->sr_max_contig_bits, 2389 0); 2390 if (ret < 0) { 2391 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, 2392 ac->ac_bh, res->sr_bits, chain); 2393 mlog_errno(ret); 2394 goto out; 2395 } 2396 2397 trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, 2398 res->sr_bits); 2399 2400 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2401 2402 BUG_ON(res->sr_bits != 1); 2403 2404 *suballoc_loc = res->sr_bg_blkno; 2405 *suballoc_bit = res->sr_bit_offset; 2406 ac->ac_bits_given++; 2407 ocfs2_save_inode_ac_group(dir, ac); 2408 2409 out: 2410 brelse(bg_bh); 2411 2412 return ret; 2413 } 2414 2415 int ocfs2_claim_new_inode(handle_t *handle, 2416 struct inode *dir, 2417 struct buffer_head *parent_fe_bh, 2418 struct ocfs2_alloc_context *ac, 2419 u64 *suballoc_loc, 2420 u16 *suballoc_bit, 2421 u64 *fe_blkno) 2422 { 2423 int status; 2424 struct ocfs2_suballoc_result res; 2425 2426 BUG_ON(!ac); 2427 BUG_ON(ac->ac_bits_given != 0); 2428 BUG_ON(ac->ac_bits_wanted != 1); 2429 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 2430 2431 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2432 2433 status = ocfs2_claim_suballoc_bits(ac, 2434 handle, 2435 1, 2436 1, 2437 &res); 2438 if (status < 0) { 2439 mlog_errno(status); 2440 goto bail; 2441 } 
2442 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2443 2444 BUG_ON(res.sr_bits != 1); 2445 2446 *suballoc_loc = res.sr_bg_blkno; 2447 *suballoc_bit = res.sr_bit_offset; 2448 *fe_blkno = res.sr_blkno; 2449 ac->ac_bits_given++; 2450 ocfs2_save_inode_ac_group(dir, ac); 2451 status = 0; 2452 bail: 2453 if (status) 2454 mlog_errno(status); 2455 return status; 2456 } 2457 2458 /* translate a group desc. blkno and it's bitmap offset into 2459 * disk cluster offset. */ 2460 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 2461 u64 bg_blkno, 2462 u16 bg_bit_off) 2463 { 2464 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2465 u32 cluster = 0; 2466 2467 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2468 2469 if (bg_blkno != osb->first_cluster_group_blkno) 2470 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); 2471 cluster += (u32) bg_bit_off; 2472 return cluster; 2473 } 2474 2475 /* given a cluster offset, calculate which block group it belongs to 2476 * and return that block offset. */ 2477 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) 2478 { 2479 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2480 u32 group_no; 2481 2482 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2483 2484 group_no = cluster / osb->bitmap_cpg; 2485 if (!group_no) 2486 return osb->first_cluster_group_blkno; 2487 return ocfs2_clusters_to_blocks(inode->i_sb, 2488 group_no * osb->bitmap_cpg); 2489 } 2490 2491 /* given the block number of a cluster start, calculate which cluster 2492 * group and descriptor bitmap offset that corresponds to. 
*/ 2493 static inline void ocfs2_block_to_cluster_group(struct inode *inode, 2494 u64 data_blkno, 2495 u64 *bg_blkno, 2496 u16 *bg_bit_off) 2497 { 2498 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2499 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); 2500 2501 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 2502 2503 *bg_blkno = ocfs2_which_cluster_group(inode, 2504 data_cluster); 2505 2506 if (*bg_blkno == osb->first_cluster_group_blkno) 2507 *bg_bit_off = (u16) data_cluster; 2508 else 2509 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, 2510 data_blkno - *bg_blkno); 2511 } 2512 2513 /* 2514 * min_bits - minimum contiguous chunk from this total allocation we 2515 * can handle. set to what we asked for originally for a full 2516 * contig. allocation, set to '1' to indicate we can deal with extents 2517 * of any size. 2518 */ 2519 int __ocfs2_claim_clusters(handle_t *handle, 2520 struct ocfs2_alloc_context *ac, 2521 u32 min_clusters, 2522 u32 max_clusters, 2523 u32 *cluster_start, 2524 u32 *num_clusters) 2525 { 2526 int status; 2527 unsigned int bits_wanted = max_clusters; 2528 struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; 2529 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); 2530 2531 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 2532 2533 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 2534 && ac->ac_which != OCFS2_AC_USE_MAIN 2535 && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG); 2536 2537 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2538 WARN_ON(min_clusters > 1); 2539 2540 status = ocfs2_claim_local_alloc_bits(osb, 2541 handle, 2542 ac, 2543 bits_wanted, 2544 cluster_start, 2545 num_clusters); 2546 if (!status) 2547 atomic_inc(&osb->alloc_stats.local_data); 2548 } else { 2549 if (min_clusters > (osb->bitmap_cpg - 1)) { 2550 /* The only paths asking for contiguousness 2551 * should know about this already. 
*/ 2552 mlog(ML_ERROR, "minimum allocation requested %u exceeds " 2553 "group bitmap size %u!\n", min_clusters, 2554 osb->bitmap_cpg); 2555 status = -ENOSPC; 2556 goto bail; 2557 } 2558 /* clamp the current request down to a realistic size. */ 2559 if (bits_wanted > (osb->bitmap_cpg - 1)) 2560 bits_wanted = osb->bitmap_cpg - 1; 2561 2562 status = ocfs2_claim_suballoc_bits(ac, 2563 handle, 2564 bits_wanted, 2565 min_clusters, 2566 &res); 2567 if (!status) { 2568 BUG_ON(res.sr_blkno); /* cluster alloc can't set */ 2569 *cluster_start = 2570 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2571 res.sr_bg_blkno, 2572 res.sr_bit_offset); 2573 atomic_inc(&osb->alloc_stats.bitmap_data); 2574 *num_clusters = res.sr_bits; 2575 } 2576 } 2577 if (status < 0) { 2578 if (status != -ENOSPC) 2579 mlog_errno(status); 2580 goto bail; 2581 } 2582 2583 ac->ac_bits_given += *num_clusters; 2584 2585 bail: 2586 if (status) 2587 mlog_errno(status); 2588 return status; 2589 } 2590 2591 int ocfs2_claim_clusters(handle_t *handle, 2592 struct ocfs2_alloc_context *ac, 2593 u32 min_clusters, 2594 u32 *cluster_start, 2595 u32 *num_clusters) 2596 { 2597 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2598 2599 return __ocfs2_claim_clusters(handle, ac, min_clusters, 2600 bits_wanted, cluster_start, num_clusters); 2601 } 2602 2603 static int ocfs2_block_group_clear_bits(handle_t *handle, 2604 struct inode *alloc_inode, 2605 struct ocfs2_group_desc *bg, 2606 struct buffer_head *group_bh, 2607 unsigned int bit_off, 2608 unsigned int num_bits, 2609 unsigned int max_contig_bits, 2610 void (*undo_fn)(unsigned int bit, 2611 unsigned long *bmap)) 2612 { 2613 int status; 2614 unsigned int tmp; 2615 u16 contig_bits; 2616 struct ocfs2_group_desc *undo_bg = NULL; 2617 struct journal_head *jh; 2618 2619 /* The caller got this descriptor from 2620 * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 2621 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 2622 2623 trace_ocfs2_block_group_clear_bits(bit_off, num_bits); 2624 2625 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); 2626 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2627 group_bh, 2628 undo_fn ? 2629 OCFS2_JOURNAL_ACCESS_UNDO : 2630 OCFS2_JOURNAL_ACCESS_WRITE); 2631 if (status < 0) { 2632 mlog_errno(status); 2633 goto bail; 2634 } 2635 2636 jh = bh2jh(group_bh); 2637 if (undo_fn) { 2638 spin_lock(&jh->b_state_lock); 2639 undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; 2640 BUG_ON(!undo_bg); 2641 } 2642 2643 tmp = num_bits; 2644 while(tmp--) { 2645 ocfs2_clear_bit((bit_off + tmp), 2646 (unsigned long *) bg->bg_bitmap); 2647 if (undo_fn) 2648 undo_fn(bit_off + tmp, 2649 (unsigned long *) undo_bg->bg_bitmap); 2650 } 2651 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2652 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 2653 if (undo_fn) 2654 spin_unlock(&jh->b_state_lock); 2655 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", 2656 (unsigned long long)le64_to_cpu(bg->bg_blkno), 2657 le16_to_cpu(bg->bg_bits), 2658 le16_to_cpu(bg->bg_free_bits_count), 2659 num_bits); 2660 } 2661 2662 /* 2663 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), 2664 * we still need to rescan whole bitmap. 2665 */ 2666 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 2667 contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, 2668 le16_to_cpu(bg->bg_bits), 0); 2669 if (contig_bits > max_contig_bits) 2670 max_contig_bits = contig_bits; 2671 bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); 2672 } else { 2673 bg->bg_contig_free_bits = 0; 2674 } 2675 2676 if (undo_fn) 2677 spin_unlock(&jh->b_state_lock); 2678 2679 ocfs2_journal_dirty(handle, group_bh); 2680 bail: 2681 return status; 2682 } 2683 2684 /* 2685 * Reclaim the suballocator managed space to main bitmap. 
2686 * This function first works on the suballocator to perform the 2687 * cleanup rec/alloc_inode job, then switches to the main bitmap 2688 * to reclaim released space. 2689 * 2690 * handle: The transaction handle 2691 * alloc_inode: The suballoc inode 2692 * alloc_bh: The buffer_head of suballoc inode 2693 * group_bh: The group descriptor buffer_head of suballocator managed. 2694 * Caller should release the input group_bh. 2695 */ 2696 static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle, 2697 struct inode *alloc_inode, 2698 struct buffer_head *alloc_bh, 2699 struct buffer_head *group_bh) 2700 { 2701 int idx, status = 0; 2702 int i, next_free_rec, len = 0; 2703 __le16 old_bg_contig_free_bits = 0; 2704 u16 start_bit; 2705 u32 tmp_used; 2706 u64 bg_blkno, start_blk; 2707 unsigned int count; 2708 struct ocfs2_chain_rec *rec; 2709 struct buffer_head *main_bm_bh = NULL; 2710 struct inode *main_bm_inode = NULL; 2711 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 2712 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2713 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2714 struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data; 2715 2716 idx = le16_to_cpu(group->bg_chain); 2717 rec = &(cl->cl_recs[idx]); 2718 2719 status = ocfs2_extend_trans(handle, 2720 ocfs2_calc_group_alloc_credits(osb->sb, 2721 le16_to_cpu(cl->cl_cpg))); 2722 if (status) { 2723 mlog_errno(status); 2724 goto bail; 2725 } 2726 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2727 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2728 if (status < 0) { 2729 mlog_errno(status); 2730 goto bail; 2731 } 2732 2733 /* 2734 * Only clear the suballocator rec item in-place. 2735 * 2736 * If idx is not the last, we don't compress (remove the empty item) 2737 * the cl_recs[]. If not, we need to do lots jobs. 
2738 * 2739 * Compress cl_recs[] code example: 2740 * if (idx != cl->cl_next_free_rec - 1) 2741 * memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1], 2742 * sizeof(struct ocfs2_chain_rec) * 2743 * (cl->cl_next_free_rec - idx - 1)); 2744 * for(i = idx; i < cl->cl_next_free_rec-1; i++) { 2745 * group->bg_chain = "later group->bg_chain"; 2746 * group->bg_blkno = xxx; 2747 * ... ... 2748 * } 2749 */ 2750 2751 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total); 2752 fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total)); 2753 2754 /* Substraction 1 for the block group itself */ 2755 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2756 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1); 2757 2758 tmp_used = le32_to_cpu(fe->i_clusters); 2759 fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg)); 2760 2761 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 2762 OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters); 2763 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 2764 le32_to_cpu(fe->i_clusters))); 2765 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 2766 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 2767 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 2768 2769 ocfs2_journal_dirty(handle, alloc_bh); 2770 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); 2771 2772 start_blk = le64_to_cpu(rec->c_blkno); 2773 count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc); 2774 2775 /* 2776 * If the rec is the last one, let's compress the chain list by 2777 * removing the empty cl_recs[] at the end. 
2778 */ 2779 next_free_rec = le16_to_cpu(cl->cl_next_free_rec); 2780 if (idx == (next_free_rec - 1)) { 2781 len++; /* the last item should be counted first */ 2782 for (i = (next_free_rec - 2); i > 0; i--) { 2783 if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total) 2784 len++; 2785 else 2786 break; 2787 } 2788 } 2789 le16_add_cpu(&cl->cl_next_free_rec, -len); 2790 2791 rec->c_free = 0; 2792 rec->c_total = 0; 2793 rec->c_blkno = 0; 2794 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh); 2795 memset(group, 0, sizeof(struct ocfs2_group_desc)); 2796 2797 /* prepare job for reclaim clusters */ 2798 main_bm_inode = ocfs2_get_system_file_inode(osb, 2799 GLOBAL_BITMAP_SYSTEM_INODE, 2800 OCFS2_INVALID_SLOT); 2801 if (!main_bm_inode) 2802 goto bail; /* ignore the error in reclaim path */ 2803 2804 inode_lock(main_bm_inode); 2805 2806 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); 2807 if (status < 0) 2808 goto free_bm_inode; /* ignore the error in reclaim path */ 2809 2810 ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno, 2811 &start_bit); 2812 fe = (struct ocfs2_dinode *) main_bm_bh->b_data; 2813 cl = &fe->id2.i_chain; 2814 /* reuse group_bh, caller will release the input group_bh */ 2815 group_bh = NULL; 2816 2817 /* reclaim clusters to global_bitmap */ 2818 status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno, 2819 &group_bh); 2820 if (status < 0) { 2821 mlog_errno(status); 2822 goto free_bm_bh; 2823 } 2824 group = (struct ocfs2_group_desc *) group_bh->b_data; 2825 2826 if ((count + start_bit) > le16_to_cpu(group->bg_bits)) { 2827 ocfs2_error(alloc_inode->i_sb, 2828 "reclaim length (%d) beyands block group length (%d)", 2829 count + start_bit, le16_to_cpu(group->bg_bits)); 2830 goto free_group_bh; 2831 } 2832 2833 old_bg_contig_free_bits = group->bg_contig_free_bits; 2834 status = ocfs2_block_group_clear_bits(handle, main_bm_inode, 2835 group, group_bh, 2836 start_bit, count, 0, 2837 _ocfs2_clear_bit); 2838 if (status < 0) 
{ 2839 mlog_errno(status); 2840 goto free_group_bh; 2841 } 2842 2843 status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 2844 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2845 if (status < 0) { 2846 mlog_errno(status); 2847 ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh, 2848 start_bit, count, 2849 le16_to_cpu(old_bg_contig_free_bits), 1); 2850 goto free_group_bh; 2851 } 2852 2853 idx = le16_to_cpu(group->bg_chain); 2854 rec = &(cl->cl_recs[idx]); 2855 2856 le32_add_cpu(&rec->c_free, count); 2857 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2858 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2859 ocfs2_journal_dirty(handle, main_bm_bh); 2860 2861 free_group_bh: 2862 brelse(group_bh); 2863 2864 free_bm_bh: 2865 ocfs2_inode_unlock(main_bm_inode, 1); 2866 brelse(main_bm_bh); 2867 2868 free_bm_inode: 2869 inode_unlock(main_bm_inode); 2870 iput(main_bm_inode); 2871 2872 bail: 2873 return status; 2874 } 2875 2876 /* 2877 * expects the suballoc inode to already be locked. 2878 */ 2879 static int _ocfs2_free_suballoc_bits(handle_t *handle, 2880 struct inode *alloc_inode, 2881 struct buffer_head *alloc_bh, 2882 unsigned int start_bit, 2883 u64 bg_blkno, 2884 unsigned int count, 2885 void (*undo_fn)(unsigned int bit, 2886 unsigned long *bitmap)) 2887 { 2888 int idx, status = 0; 2889 u32 tmp_used; 2890 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2891 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2892 struct buffer_head *group_bh = NULL; 2893 struct ocfs2_group_desc *group; 2894 struct ocfs2_chain_rec *rec; 2895 __le16 old_bg_contig_free_bits = 0; 2896 2897 /* The alloc_bh comes from ocfs2_free_dinode() or 2898 * ocfs2_free_clusters(). The callers have all locked the 2899 * allocator and gotten alloc_bh from the lock call. This 2900 * validates the dinode buffer. Any corruption that has happened 2901 * is a code bug. 
*/ 2902 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2903 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 2904 2905 trace_ocfs2_free_suballoc_bits( 2906 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, 2907 (unsigned long long)bg_blkno, 2908 start_bit, count); 2909 2910 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, 2911 &group_bh); 2912 if (status < 0) { 2913 mlog_errno(status); 2914 goto bail; 2915 } 2916 group = (struct ocfs2_group_desc *) group_bh->b_data; 2917 2918 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 2919 2920 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2921 old_bg_contig_free_bits = group->bg_contig_free_bits; 2922 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2923 group, group_bh, 2924 start_bit, count, 0, undo_fn); 2925 if (status < 0) { 2926 mlog_errno(status); 2927 goto bail; 2928 } 2929 2930 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2931 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2932 if (status < 0) { 2933 mlog_errno(status); 2934 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, 2935 start_bit, count, 2936 le16_to_cpu(old_bg_contig_free_bits), 1); 2937 goto bail; 2938 } 2939 2940 idx = le16_to_cpu(group->bg_chain); 2941 rec = &(cl->cl_recs[idx]); 2942 2943 le32_add_cpu(&rec->c_free, count); 2944 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2945 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2946 ocfs2_journal_dirty(handle, alloc_bh); 2947 2948 /* 2949 * Reclaim suballocator free space. 
2950 * Bypass: global_bitmap, non empty rec, first rec in cl_recs[] 2951 */ 2952 if (ocfs2_is_cluster_bitmap(alloc_inode) || 2953 (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) || 2954 (le16_to_cpu(cl->cl_next_free_rec) == 1)) { 2955 goto bail; 2956 } 2957 2958 _ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh); 2959 2960 bail: 2961 brelse(group_bh); 2962 return status; 2963 } 2964 2965 int ocfs2_free_suballoc_bits(handle_t *handle, 2966 struct inode *alloc_inode, 2967 struct buffer_head *alloc_bh, 2968 unsigned int start_bit, 2969 u64 bg_blkno, 2970 unsigned int count) 2971 { 2972 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, 2973 start_bit, bg_blkno, count, NULL); 2974 } 2975 2976 int ocfs2_free_dinode(handle_t *handle, 2977 struct inode *inode_alloc_inode, 2978 struct buffer_head *inode_alloc_bh, 2979 struct ocfs2_dinode *di) 2980 { 2981 u64 blk = le64_to_cpu(di->i_blkno); 2982 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2983 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2984 2985 if (di->i_suballoc_loc) 2986 bg_blkno = le64_to_cpu(di->i_suballoc_loc); 2987 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2988 inode_alloc_bh, bit, bg_blkno, 1); 2989 } 2990 2991 static int _ocfs2_free_clusters(handle_t *handle, 2992 struct inode *bitmap_inode, 2993 struct buffer_head *bitmap_bh, 2994 u64 start_blk, 2995 unsigned int num_clusters, 2996 void (*undo_fn)(unsigned int bit, 2997 unsigned long *bitmap)) 2998 { 2999 int status; 3000 u16 bg_start_bit; 3001 u64 bg_blkno; 3002 3003 /* You can't ever have a contiguous set of clusters 3004 * bigger than a block group bitmap so we never have to worry 3005 * about looping on them. 3006 * This is expensive. We can safely remove once this stuff has 3007 * gotten tested really well. 
*/ 3008 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, 3009 ocfs2_blocks_to_clusters(bitmap_inode->i_sb, 3010 start_blk))); 3011 3012 3013 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 3014 &bg_start_bit); 3015 3016 trace_ocfs2_free_clusters((unsigned long long)bg_blkno, 3017 (unsigned long long)start_blk, 3018 bg_start_bit, num_clusters); 3019 3020 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 3021 bg_start_bit, bg_blkno, 3022 num_clusters, undo_fn); 3023 if (status < 0) { 3024 mlog_errno(status); 3025 goto out; 3026 } 3027 3028 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), 3029 num_clusters); 3030 3031 out: 3032 return status; 3033 } 3034 3035 int ocfs2_free_clusters(handle_t *handle, 3036 struct inode *bitmap_inode, 3037 struct buffer_head *bitmap_bh, 3038 u64 start_blk, 3039 unsigned int num_clusters) 3040 { 3041 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 3042 start_blk, num_clusters, 3043 _ocfs2_set_bit); 3044 } 3045 3046 /* 3047 * Give never-used clusters back to the global bitmap. We don't need 3048 * to protect these bits in the undo buffer. 3049 */ 3050 int ocfs2_release_clusters(handle_t *handle, 3051 struct inode *bitmap_inode, 3052 struct buffer_head *bitmap_bh, 3053 u64 start_blk, 3054 unsigned int num_clusters) 3055 { 3056 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, 3057 start_blk, num_clusters, 3058 _ocfs2_clear_bit); 3059 } 3060 3061 /* 3062 * For a given allocation, determine which allocators will need to be 3063 * accessed, and lock them, reserving the appropriate number of bits. 3064 * 3065 * Sparse file systems call this from ocfs2_write_begin_nolock() 3066 * and ocfs2_allocate_unwritten_extents(). 3067 * 3068 * File systems which don't support holes call this from 3069 * ocfs2_extend_allocation(). 
3070 */ 3071 int ocfs2_lock_allocators(struct inode *inode, 3072 struct ocfs2_extent_tree *et, 3073 u32 clusters_to_add, u32 extents_to_split, 3074 struct ocfs2_alloc_context **data_ac, 3075 struct ocfs2_alloc_context **meta_ac) 3076 { 3077 int ret = 0, num_free_extents; 3078 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; 3079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3080 3081 *meta_ac = NULL; 3082 if (data_ac) 3083 *data_ac = NULL; 3084 3085 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 3086 3087 num_free_extents = ocfs2_num_free_extents(et); 3088 if (num_free_extents < 0) { 3089 ret = num_free_extents; 3090 mlog_errno(ret); 3091 goto out; 3092 } 3093 3094 /* 3095 * Sparse allocation file systems need to be more conservative 3096 * with reserving room for expansion - the actual allocation 3097 * happens while we've got a journal handle open so re-taking 3098 * a cluster lock (because we ran out of room for another 3099 * extent) will violate ordering rules. 3100 * 3101 * Most of the time we'll only be seeing this 1 cluster at a time 3102 * anyway. 3103 * 3104 * Always lock for any unwritten extents - we might want to 3105 * add blocks during a split. 3106 */ 3107 if (!num_free_extents || 3108 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { 3109 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); 3110 if (ret < 0) { 3111 if (ret != -ENOSPC) 3112 mlog_errno(ret); 3113 goto out; 3114 } 3115 } 3116 3117 if (clusters_to_add == 0) 3118 goto out; 3119 3120 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 3121 if (ret < 0) { 3122 if (ret != -ENOSPC) 3123 mlog_errno(ret); 3124 goto out; 3125 } 3126 3127 out: 3128 if (ret) { 3129 if (*meta_ac) { 3130 ocfs2_free_alloc_context(*meta_ac); 3131 *meta_ac = NULL; 3132 } 3133 3134 /* 3135 * We cannot have an error and a non null *data_ac. 
3136 */ 3137 } 3138 3139 return ret; 3140 } 3141 3142 /* 3143 * Read the inode specified by blkno to get suballoc_slot and 3144 * suballoc_bit. 3145 */ 3146 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 3147 u16 *suballoc_slot, u64 *group_blkno, 3148 u16 *suballoc_bit) 3149 { 3150 int status; 3151 struct buffer_head *inode_bh = NULL; 3152 struct ocfs2_dinode *inode_fe; 3153 3154 trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); 3155 3156 /* dirty read disk */ 3157 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 3158 if (status < 0) { 3159 mlog(ML_ERROR, "read block %llu failed %d\n", 3160 (unsigned long long)blkno, status); 3161 goto bail; 3162 } 3163 3164 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; 3165 if (!OCFS2_IS_VALID_DINODE(inode_fe)) { 3166 mlog(ML_ERROR, "invalid inode %llu requested\n", 3167 (unsigned long long)blkno); 3168 status = -EINVAL; 3169 goto bail; 3170 } 3171 3172 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && 3173 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { 3174 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", 3175 (unsigned long long)blkno, 3176 (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); 3177 status = -EINVAL; 3178 goto bail; 3179 } 3180 3181 if (suballoc_slot) 3182 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 3183 if (suballoc_bit) 3184 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 3185 if (group_blkno) 3186 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); 3187 3188 bail: 3189 brelse(inode_bh); 3190 3191 if (status) 3192 mlog_errno(status); 3193 return status; 3194 } 3195 3196 /* 3197 * test whether bit is SET in allocator bitmap or not. on success, 0 3198 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno 3199 * is returned and *res is meaningless. 
Call this after you have 3200 * cluster locked against suballoc, or you may get a result based on 3201 * non-up2date contents 3202 */ 3203 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 3204 struct inode *suballoc, 3205 struct buffer_head *alloc_bh, 3206 u64 group_blkno, u64 blkno, 3207 u16 bit, int *res) 3208 { 3209 struct ocfs2_dinode *alloc_di; 3210 struct ocfs2_group_desc *group; 3211 struct buffer_head *group_bh = NULL; 3212 u64 bg_blkno; 3213 int status, quiet = 0, released = 0; 3214 3215 trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, 3216 (unsigned int)bit); 3217 3218 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; 3219 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { 3220 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 3221 (unsigned int)bit, 3222 ocfs2_bits_per_group(&alloc_di->id2.i_chain)); 3223 status = -EINVAL; 3224 goto bail; 3225 } 3226 3227 bg_blkno = group_blkno ? group_blkno : 3228 ocfs2_which_suballoc_group(blkno, bit); 3229 status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno, 3230 &group_bh, &released); 3231 if (released) { 3232 quiet = 1; 3233 status = -ESTALE; 3234 goto bail; 3235 } else if (status < 0) { 3236 mlog(ML_ERROR, "read group %llu failed %d\n", 3237 (unsigned long long)bg_blkno, status); 3238 goto bail; 3239 } 3240 3241 group = (struct ocfs2_group_desc *) group_bh->b_data; 3242 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); 3243 3244 bail: 3245 brelse(group_bh); 3246 3247 if (status && !quiet) 3248 mlog_errno(status); 3249 return status; 3250 } 3251 3252 /* 3253 * Test if the bit representing this inode (blkno) is set in the 3254 * suballocator. 3255 * 3256 * On success, 0 is returned and *res is 1 for SET; 0 otherwise. 3257 * 3258 * In the event of failure, a negative value is returned and *res is 3259 * meaningless. 
3260 * 3261 * Callers must make sure to hold nfs_sync_lock to prevent 3262 * ocfs2_delete_inode() on another node from accessing the same 3263 * suballocator concurrently. 3264 */ 3265 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 3266 { 3267 int status, quiet = 0; 3268 u64 group_blkno = 0; 3269 u16 suballoc_bit = 0, suballoc_slot = 0; 3270 struct inode *inode_alloc_inode; 3271 struct buffer_head *alloc_bh = NULL; 3272 3273 trace_ocfs2_test_inode_bit((unsigned long long)blkno); 3274 3275 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 3276 &group_blkno, &suballoc_bit); 3277 if (status < 0) { 3278 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 3279 goto bail; 3280 } 3281 3282 if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) 3283 inode_alloc_inode = ocfs2_get_system_file_inode(osb, 3284 GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); 3285 else 3286 inode_alloc_inode = ocfs2_get_system_file_inode(osb, 3287 INODE_ALLOC_SYSTEM_INODE, suballoc_slot); 3288 if (!inode_alloc_inode) { 3289 /* the error code could be inaccurate, but we are not able to 3290 * get the correct one. 
*/ 3291 status = -EINVAL; 3292 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", 3293 (u32)suballoc_slot); 3294 goto bail; 3295 } 3296 3297 inode_lock(inode_alloc_inode); 3298 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 3299 if (status < 0) { 3300 inode_unlock(inode_alloc_inode); 3301 iput(inode_alloc_inode); 3302 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 3303 (u32)suballoc_slot, status); 3304 goto bail; 3305 } 3306 3307 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 3308 group_blkno, blkno, suballoc_bit, res); 3309 if (status < 0) { 3310 if (status == -ESTALE) 3311 quiet = 1; 3312 else 3313 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 3314 } 3315 3316 ocfs2_inode_unlock(inode_alloc_inode, 0); 3317 inode_unlock(inode_alloc_inode); 3318 3319 iput(inode_alloc_inode); 3320 brelse(alloc_bh); 3321 bail: 3322 if (status && !quiet) 3323 mlog_errno(status); 3324 return status; 3325 } 3326